diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 44632703e32..ccee7739dc6 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -0a038cf0cff2d071b7359ac0491fd2ba7798a438 +b1984237a0fb32b760c1b84d6d02d2f0f7ed293b diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index e6cd356e0e0..3a0cd57ddb5 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -6,7 +6,7 @@ sympy==1.12 timm==0.6.13 tomli==2.0.1 torchsr==1.0.4 -transformers==4.36.0 +transformers==4.38.0 zstd==1.5.5.1 pytest==7.2.0 pytest-cov==4.1.0 diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 04a9ee227da..eb1221620c1 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -26,6 +26,7 @@ install_executorch_and_backend_lib() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_QUANTIZED=ON \ -DXNNPACK_ENABLE_ARM_BF16=OFF \ -Bcmake-android-out . diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh index 2d915506158..c29c09dc63d 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test.sh @@ -37,7 +37,7 @@ build_cmake_executor_runner() { (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DBUCK2=buck2 -DCMAKE_BUILD_TYPE=Release \ + && retry cmake -DCMAKE_BUILD_TYPE=Release \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) cmake --build ${CMAKE_OUTPUT_DIR} -j4 @@ -84,8 +84,7 @@ build_cmake_xnn_executor_runner() { (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DBUCK2=buck2 \ - -DCMAKE_BUILD_TYPE=Release \ + && retry cmake -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 90ea13281ba..94528613e33 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" MODEL_NAME=$1 # stories110M.pt BUILD_TOOL=$2 # buck2 or cmake DTYPE=$3 # fp16 or fp32 -MODE=${4:-"xnnpack"} # portable or xnnpack +MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args echo "Expecting atleast 4 positional arguments" echo "Usage: [...]" @@ -37,6 +37,24 @@ if [[ -z "${MODE:-}" ]]; then exit 1 fi +if [[ "${MODE}" =~ .*xnnpack.* ]]; then + XNNPACK=ON +else + XNNPACK=OFF +fi + +if [[ "${MODE}" =~ .*custom.* ]]; then + CUSTOM=ON +else + CUSTOM=OFF +fi + +if [[ "${MODE}" =~ .*qe.* ]]; then + QE=ON +else + QE=OFF +fi + if [[ -z "${BUCK:-}" ]]; then BUCK=buck2 fi @@ -47,25 +65,21 @@ fi which "${PYTHON_EXECUTABLE}" - cmake_install_executorch_libraries() { echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" rm -rf cmake-out - if [[ "${MODE}" == "xnnpack" ]]; then - XNNPACK=ON - else - XNNPACK=OFF - fi retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_QUANTIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . 
- cmake --build cmake-out -j9 --target install --config Release + cmake --build cmake-out -j9 --target install --config Debug } cmake_build_llama_runner() { @@ -73,12 +87,14 @@ cmake_build_llama_runner() { dir="examples/models/llama2" retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out/${dir} \ ${dir} - cmake --build cmake-out/${dir} -j9 --config Release + cmake --build cmake-out/${dir} -j9 --config Debug } @@ -116,10 +132,17 @@ fi # Export model. EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte" echo "Exporting ${EXPORTED_MODEL_NAME}" -EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}" -if [[ "${MODE}" == "xnnpack" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128" +EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv" +if [[ "${XNNPACK}" == "ON" ]]; then + EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128" +fi +if [[ "${CUSTOM}" == "ON" ]]; then + EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache" +fi +if [[ "${QE}" == "ON" ]]; then + EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024" fi +# Add dynamically linked library location $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS} # Create tokenizer.bin. diff --git a/.ci/scripts/test_quantized_aot_lib.sh b/.ci/scripts/test_quantized_aot_lib.sh index ed9c789c5e4..610144f80d2 100755 --- a/.ci/scripts/test_quantized_aot_lib.sh +++ b/.ci/scripts/test_quantized_aot_lib.sh @@ -21,10 +21,9 @@ build_cmake_quantized_aot_lib() { (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DBUCK2=buck2 \ - -DCMAKE_BUILD_TYPE=Release \ + && retry cmake -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DEXECUTORCH_BUILD_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) cmake --build ${CMAKE_OUTPUT_DIR} -j4 diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index c7c00be2574..f0675f56cc7 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -99,7 +99,7 @@ build_executorch_runner_cmake() { pushd "${CMAKE_OUTPUT_DIR}" || return # This command uses buck2 to gather source files and buck2 could crash flakily # on MacOS - retry cmake -DBUCK2=buck2 -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release .. + retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release .. 
popd || return if [ "$(uname)" == "Darwin" ]; then diff --git a/.clang-format b/.clang-format index 31a13c408fc..8ec7b569e24 100644 --- a/.clang-format +++ b/.clang-format @@ -2,22 +2,55 @@ Language: Cpp AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak -AlignConsecutiveMacros: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false AlignEscapedNewlines: Left AlignOperands: DontAlign -AlignTrailingComments: false +AlignTrailingComments: + Kind: Never + OverEmptyLines: 0 AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: false -AllowShortEnumsOnASingleLine: true +AllowBreakBeforeNoexceptSpecifier: Never AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty -AllowShortLambdasOnASingleLine: All AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None @@ -27,17 +60,18 @@ AttributeMacros: - __capability BinPackArguments: false BinPackParameters: false +BitFieldColonSpacing: Both BraceWrapping: AfterCaseLabel: false AfterClass: false AfterControlStatement: Never AfterEnum: false + AfterExternBlock: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false - AfterExternBlock: false BeforeCatch: false BeforeElse: false BeforeLambdaBody: false @@ -46,26 +80,27 @@ BraceWrapping: SplitEmptyFunction: true SplitEmptyRecord: true SplitEmptyNamespace: true +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakArrays: true BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: true +BreakBeforeConceptDeclarations: Always BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon +BreakBeforeInlineASMColon: OnlyMultiline BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false +BreakInheritanceList: BeforeColon BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DeriveLineEnding: true DerivePointerAlignment: false DisableFormat: false +EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: LogicalBlock 
ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true @@ -73,8 +108,8 @@ ForEachMacros: - FOR_EACH - FOR_EACH_R - FOR_EACH_RANGE -StatementAttributeLikeMacros: - - Q_EMIT +IfMacros: + - KJ_IF_MAYBE IncludeBlocks: Preserve IncludeCategories: - Regex: '^<.*\.h(pp)?>' @@ -91,18 +126,31 @@ IncludeCategories: CaseSensitive: false IncludeIsMainRegex: '(Test)?$' IncludeIsMainSourceRegex: '' -IndentCaseLabels: true +IndentAccessModifiers: false IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock IndentGotoLabels: true IndentPPDirectives: None -IndentExternBlock: AfterExternBlock -IndentRequires: false +IndentRequiresClause: true IndentWidth: 2 IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 @@ -112,53 +160,85 @@ ObjCBlockIndentWidth: 2 ObjCBreakBeforeNestedBlockParam: true ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false +PackConstructorInitializers: NextLine PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakScopeResolution: 500 PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Pointer ReflowComments: true -SortIncludes: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SkipMacroDefinitionBody: false +SortIncludes: CaseSensitive SortJavaStaticImport: Before -SortUsingDeclarations: true +SortUsingDeclarations: LexicographicNumeric SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false SpaceBeforeParens: ControlStatements -SpaceAroundPointerQualifiers: Default +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterPlacementOperator: true + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInConditionalStatement: false +SpacesInAngles: Never SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false 
SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -BitFieldColonSpacing: Both Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION TabWidth: 8 -UseCRLF: false UseTab: Never +VerilogBreakBetweenInstancePorts: true WhitespaceSensitiveMacros: - - STRINGIZE - - PP_STRINGIZE - BOOST_PP_STRINGIZE - - NS_SWIFT_NAME - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE ... diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..1a7fef172cd --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# From https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + reviewers: + - "pytorch/team-executorch" + allow: + - dependency-name: "torchfix" diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index c36c5861168..219812fdd26 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -57,9 +57,6 @@ jobs: script: | set -eux - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - BUILD_TOOL=${{ matrix.build-tool }} bash .ci/scripts/setup-conda.sh @@ -75,5 +72,3 @@ jobs: ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml # Run gtest ${CONDA_RUN} buck2 test runtime/core/... runtime/platform/... - - popd diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 0d8931cf102..fa5cd854cc0 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -10,7 +10,8 @@ on: - .ci/docker/** - .github/workflows/android.yml - install_requirements.sh - - examples/demo-apps/** + - examples/demo-apps/android/** + - extension/android/** - extension/module/** workflow_dispatch: @@ -33,6 +34,7 @@ jobs: submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + upload-artifact: android-apps script: | set -eux @@ -45,3 +47,62 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build Android demo app bash build/test_android_ci.sh + + mkdir -p artifacts-to-be-uploaded + # Copy the app and its test suite to S3 + cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/ + cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/ + # Also copy the share libraries + cp cmake-out-android/lib/*.a artifacts-to-be-uploaded/ + + # Upload the app and its test suite to S3 so that they can be downloaded by the test job + upload-artifacts: + needs: test-demo-android + runs-on: linux.2xlarge + steps: + - name: Download the artifacts + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: android-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + # Let's see how expensive this job is, we might want to tone it down by running it periodically + test-llama-app: + needs: upload-artifacts + 
permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + with: + device-type: android + runner: ubuntu-latest + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + # This is the custom Android device pool that only includes Samsung Galaxy S2x + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa + # Uploaded to S3 from the previous job, the name of the app comes from the project itself + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug-androidTest.apk + # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml + test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77 + # The exported llama2 model and its tokenizer, can be downloaded from https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b.zip. + # Among the input, this is the biggest file and uploading it to AWS beforehand makes the test run much faster + extra-data: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/bd15825b-ddab-4e47-9fef-a9c8935778dd diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 06aa6a66e98..54d019b6764 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -34,8 +34,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" BUILD_TOOL=cmake .ci/scripts/setup-conda.sh @@ -48,8 +46,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ build/test_ios_ci.sh - popd - build-frameworks-ios: name: build-frameworks-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -61,10 +57,8 @@ jobs: upload-artifact: executorch-frameworks-ios timeout: 90 script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" BUILD_TOOL=cmake - VERSION="0.1.0" + VERSION="latest" FRAMEWORKS=( "executorch" "coreml_backend" @@ -111,8 +105,6 @@ jobs: zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}_debug-${VERSION}.zip" "${FRAMEWORK}_debug.xcframework" ) done - popd - upload-frameworks-ios: runs-on: ubuntu-22.04 needs: build-frameworks-ios diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index ee5cfb859b3..ccc852c24fb 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -8,6 +8,7 @@ on: - release/* tags: - v[0-9]+.[0-9]+.[0-9]+ + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ workflow_dispatch: schedule: - cron: '0 0 * * *' @@ -46,13 +47,9 @@ jobs: # ET_VERSION_DOCS will be pulled during the doc build to add to the version dropdown # on the website. See docs/source/conf.py for details - REF_TYPE=${{ github.ref_type }} - REF_NAME=${{ github.ref_name }} - - echo "$REF_TYPE" - echo "$REF_NAME" - - ET_VERSION_DOCS="${REF_NAME}" + GITHUB_REF=${{ github.ref }} + echo "$GITHUB_REF" + export ET_VERSION_DOCS="${GITHUB_REF}" echo "$ET_VERSION_DOCS" set -eux @@ -68,23 +65,21 @@ jobs: make html cd .. 
+ # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. + echo "GitHub Ref: ${GITHUB_REF}" + if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then + find docs/_build/html/ -name "*.html" -print0 | xargs -0 sed -i '//a \ \ '; + fi + cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}" mv docs/_build/html "${RUNNER_ARTIFACT_DIR}" ls -R "${RUNNER_ARTIFACT_DIR}"/*/*.html -# Enable preview later. Previews are available publicly -# -# upload-preview: -# if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && -# (github.ref_type == 'branch' && github.ref_name == 'main') -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - upload-gh-pages: needs: build - if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && - ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v')) permissions: contents: write uses: pytorch/test-infra/.github/workflows/linux_job.yml@main @@ -96,22 +91,17 @@ jobs: script: | set -euo pipefail - REF_TYPE=${{ github.ref_type }} - REF_NAME=${{ github.ref_name }} - - # If building for a release tag, branch, set the branch/tag name - # as the target folder in the gh-pages branch. The artifacts created - # during the build will be copied over to the target dir in the - # gh-pages branch. - if [[ "${REF_TYPE}" == branch ]]; then - TARGET_FOLDER="${REF_NAME}" - elif [[ "${REF_TYPE}" == tag ]]; then - # Strip the leading "v" as well as the trailing patch version and "-rc" suffix. - # For example: 'v0.1.2' -> '0.1' and 'v0.1.2-rc1' -> 0.1. - TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/^v//i; s/-rc[0-9]*$//; s/\.[0-9]*$//') + # Get github.ref for the output doc folder. By default "main" + # If matches a tag like refs/tags/v1.12.0-rc3 or + # refs/tags/v1.12.0 convert to 1.12 + GITHUB_REF=${{ github.ref }} + + # Convert refs/tags/v1.12.0rc3 into 1.12. + # Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13 + if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then + TARGET_FOLDER="${BASH_REMATCH[1]}" else - echo "ERROR: Invalid REF_TYPE: ${REF_TYPE}. Expected 'branch' or 'tag'." - exit 1 + TARGET_FOLDER="main" fi echo "Target Folder: ${TARGET_FOLDER}" @@ -122,12 +112,6 @@ jobs: mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" git add "${TARGET_FOLDER}" || true - # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. 
- if [[ "${REF_NAME}" == 'main' ]]; then - find "${TARGET_FOLDER}" -type f -name "*.html" -exec sed -i '//a ' {} \; - git add "${TARGET_FOLDER}"/**/*.html || true - fi - git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' git commit -m "Auto-generating sphinx docs" || true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9751b906cd8..f650fc79209 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -90,7 +90,7 @@ jobs: matrix: dtype: [fp32] build-tool: [buck2, cmake] - mode: [portable, xnnpack] + mode: [portable, xnnpack+custom, xnnpack+custom+qe] fail-fast: false with: runner: linux.2xlarge diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 16ed6a27577..b10c6227d39 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -46,9 +46,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: ${{ matrix.timeout }} script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - MODEL_NAME=${{ matrix.model }} BUILD_TOOL=${{ matrix.build-tool }} BACKEND=${{ matrix.backend }} @@ -59,7 +56,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test xecutorch PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" - popd test-custom-ops-macos: name: test-custom-ops-macos @@ -75,9 +71,6 @@ jobs: submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - BUILD_TOOL=${{ matrix.build-tool }} bash .ci/scripts/setup-conda.sh @@ -85,7 +78,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test custom ops PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" - popd test-selective-build-macos: name: test-selective-build-macos @@ -101,9 +93,6 @@ jobs: submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - BUILD_TOOL=${{ matrix.build-tool }} bash .ci/scripts/setup-conda.sh @@ -111,7 +100,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test selective build PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - popd test-demo-backend-delegation: name: test-demo-backend-delegation @@ -208,9 +196,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - BUILD_TOOL=cmake bash .ci/scripts/setup-conda.sh @@ -218,7 +203,6 @@ jobs: GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test coreml delegate PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh - popd test-pybind-build-macos: name: test-pybind-build-macos @@ -235,8 +219,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 180 script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" bash .ci/scripts/setup-conda.sh # build module for 
executorch.extension.pybindings.portable_lib @@ -245,7 +227,6 @@ jobs: # see if we can import the module successfully ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')" - popd test-llama-runner-macos: name: test-llama-runner-mac @@ -254,7 +235,7 @@ jobs: matrix: dtype: [fp32] build-tool: [buck2, cmake] - mode: [portable, xnnpack] + mode: [portable, xnnpack+kv+custom] fail-fast: false with: runner: macos-m1-stable @@ -263,8 +244,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" bash .ci/scripts/setup-conda.sh DTYPE=${{ matrix.dtype }} @@ -278,4 +257,3 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M.pt "${BUILD_TOOL}" "${DTYPE}" "${MODE}" - popd diff --git a/.gitignore b/.gitignore index 6661daed13e..26a46f23f62 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ buck-out/ cmake-out/ cmake-android-out/ +cmake-out-android/ cmake-ios-out/ ethos-u-scratch/ executorch.egg-info diff --git a/.gitmodules b/.gitmodules index 44137b27a71..42deca0a6bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -62,3 +62,9 @@ [submodule "examples/third-party/LLaVA"] path = examples/third-party/LLaVA url = https://github.com/haotian-liu/LLaVA.git +[submodule "examples/models/llama2/third-party/re2"] + path = examples/models/llama2/third-party/re2 + url = https://github.com/google/re2.git +[submodule "examples/models/llama2/third-party/abseil-cpp"] + path = examples/models/llama2/third-party/abseil-cpp + url = https://github.com/abseil/abseil-cpp.git diff --git a/.swift/custom_backend_debug/dummy.swift b/.swift/custom_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/executorch/dummy.swift b/.swift/executorch/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/executorch_debug/dummy.swift b/.swift/executorch_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/mps_backend/dummy.swift b/.swift/mps_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/mps_backend_debug/dummy.swift b/.swift/mps_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/optimized_backend/dummy.swift b/.swift/optimized_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/optimized_backend_debug/dummy.swift b/.swift/optimized_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/portable_backend/dummy.swift b/.swift/portable_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/portable_backend_debug/dummy.swift b/.swift/portable_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/quantized_backend/dummy.swift b/.swift/quantized_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/quantized_backend_debug/dummy.swift b/.swift/quantized_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.swift/xnnpack_backend/dummy.swift b/.swift/xnnpack_backend/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git 
a/.swift/xnnpack_backend_debug/dummy.swift b/.swift/xnnpack_backend_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/CMakeLists.txt b/CMakeLists.txt index 46b73f63492..0610462aed9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,8 @@ option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF) option(EXECUTORCH_BUILD_CUSTOM "Build the custom kernels" OFF) +option(EXECUTORCH_BUILD_CUSTOM_OPS_AOT "Build the custom ops lib for AOT" OFF) + option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "Build the Data Loader extension" OFF) @@ -175,8 +177,9 @@ option(EXECUTORCH_BUILD_VULKAN "Build the Vulkan backend" OFF) # # pthreadpool: build pthreadpool library. Disable on unsupported platforms # -cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." - ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) +cmake_dependent_option( + EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON + "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) # # cpuinfo: build cpuinfo library. Disable on unsupported platforms @@ -184,8 +187,19 @@ cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) +if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT) + set(EXECUTORCH_BUILD_CUSTOM ON) +endif() + +if(EXECUTORCH_BUILD_CUSTOM) + set(EXECUTORCH_BUILD_OPTIMIZED ON) +endif() + if(EXECUTORCH_BUILD_CPUINFO) # --- cpuinfo + set(ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG + ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CPUINFO_SOURCE_DIR "backends/xnnpack/third-party/cpuinfo") set(CPUINFO_BUILD_TOOLS OFF @@ -207,10 +221,15 @@ if(EXECUTORCH_BUILD_CPUINFO) CACHE STRING "") set(CLOG_SOURCE_DIR "${CPUINFO_SOURCE_DIR}/deps/clog") add_subdirectory("${CPUINFO_SOURCE_DIR}") + set(CMAKE_POSITION_INDEPENDENT_CODE + ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG}) endif() if(EXECUTORCH_BUILD_PTHREADPOOL) # --- pthreadpool + set(ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG + ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(PTHREADPOOL_SOURCE_DIR "backends/xnnpack/third-party/pthreadpool") set(PTHREADPOOL_BUILD_TESTS OFF @@ -230,6 +249,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) CACHE STRING "") endif() add_subdirectory("${PTHREADPOOL_SOURCE_DIR}") + set(CMAKE_POSITION_INDEPENDENT_CODE + ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG}) endif() if(NOT PYTHON_EXECUTABLE) @@ -352,23 +373,27 @@ add_subdirectory(schema) # Only contains primitive operators; does not contain portable kernels or other # full operators. Does not contain any backends. # - -add_library(executorch ${_executorch__srcs}) -target_link_libraries(executorch PRIVATE program_schema) -target_link_options_shared_lib(executorch) +add_library(executorch_no_prim_ops ${_executorch_no_prim_ops__srcs}) +target_link_libraries(executorch_no_prim_ops PRIVATE program_schema) # Check if dl exists for this toolchain and only then link it. 
find_library(DL_LIBRARY_EXISTS NAMES dl) # Check if the library was found if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch PRIVATE dl) # For dladdr() + target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr() endif() -target_include_directories(executorch PUBLIC ${_common_include_directories}) -target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_include_directories(executorch_no_prim_ops PUBLIC ${_common_include_directories}) +target_compile_options(executorch_no_prim_ops PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) - target_compile_definitions(executorch + target_compile_definitions(executorch_no_prim_ops PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM}) endif() +add_library(executorch ${_executorch__srcs}) +target_link_libraries(executorch PRIVATE executorch_no_prim_ops) +target_include_directories(executorch PUBLIC ${_common_include_directories}) +target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_link_options_shared_lib(executorch) + # # portable_ops_lib: A library to register core ATen ops using portable kernels, # see kernels/portable/CMakeLists.txt. @@ -406,7 +431,7 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ install( - TARGETS executorch + TARGETS executorch executorch_no_prim_ops DESTINATION lib INCLUDES DESTINATION ${_common_include_directories}) @@ -504,25 +529,51 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) endif() + # find pytorch lib, to allow pybind to take at::Tensor as input/output + find_package(Torch CONFIG REQUIRED) + find_library(TORCH_PYTHON_LIBRARY torch_python + PATHS "${TORCH_INSTALL_PREFIX}/lib") + + set(_dep_libs + ${TORCH_PYTHON_LIBRARY} + bundled_program + etdump + executorch + extension_data_loader + portable_ops_lib + util + torch) + if(EXECUTORCH_BUILD_COREML) - set(PYBIND_LINK_COREML "coremldelegate") + list(APPEND _dep_libs coremldelegate) endif() if(EXECUTORCH_BUILD_MPS) - set(PYBIND_LINK_MPS "mpsdelegate") + list(APPEND _dep_libs mpsdelegate) endif() if(EXECUTORCH_BUILD_XNNPACK) - # need to explicitly specify XNNPACK here - # otherwise uses XNNPACK symbols from libtorch_cpu - set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK) + # need to explicitly specify XNNPACK here otherwise uses XNNPACK symbols + # from libtorch_cpu + list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch CONFIG REQUIRED) - find_library(TORCH_PYTHON_LIBRARY torch_python - PATHS "${TORCH_INSTALL_PREFIX}/lib") + if(EXECUTORCH_BUILD_QUANTIZED) + target_link_options_shared_lib(quantized_ops_lib) + list(APPEND _dep_libs quantized_kernels quantized_ops_lib) + endif() + # TODO(larryliu): Fix macOS 2 dylibs having 2 sets of static variables issue + if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT AND NOT APPLE) + list(APPEND _dep_libs custom_ops_aot_lib) + endif() + # TODO(laryliu): Fix linux duplicate registation problem. 
In GH CI worker + # libcustom_ops.a doesn't dedup with the one indirectly linked from + # libcustom_ops_aot_lib.a + if(EXECUTORCH_BUILD_CUSTOM AND APPLE) + target_link_options_shared_lib(custom_ops) + list(APPEND _dep_libs custom_ops) + endif() # compile options for pybind set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti @@ -540,23 +591,14 @@ if(EXECUTORCH_BUILD_PYBIND) # pybind portable_lib pybind11_add_module(portable_lib extension/pybindings/pybindings.cpp) + # The actual output file needs a leading underscore so it can coexist with + # portable_lib.py in the same python package. + set_target_properties(portable_lib PROPERTIES OUTPUT_NAME "_portable_lib") target_compile_definitions(portable_lib - PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib) + PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=_portable_lib) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) - target_link_libraries( - portable_lib - PUBLIC ${TORCH_PYTHON_LIBRARY} - bundled_program - etdump - executorch - extension_data_loader - portable_ops_lib - util - torch - ${PYBIND_LINK_COREML} - ${PYBIND_LINK_MPS} - ${PYBIND_LINK_XNNPACK}) + target_link_libraries(portable_lib PUBLIC ${_dep_libs}) install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings) diff --git a/Package.swift b/Package.swift deleted file mode 100644 index b0dfec174f2..00000000000 --- a/Package.swift +++ /dev/null @@ -1,101 +0,0 @@ -// swift-tools-version:5.9 -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -import PackageDescription - -let version = "0.1.0" -let url = "https://ossci-ios.s3.amazonaws.com/executorch/" -let debug = "_debug" -let deliverables = [ - "coreml_backend": [ - "sha256": "5bfa35cb5143b4af6840e0e5dd2d40bce93dff331b8eb5798a46274239391a5d", - "sha256" + debug: "1422019da9000f8ff7be597de9e0e3b2482f99cdaa75c2d179835778647be1a6", - "frameworks": [ - "Accelerate", - "CoreML", - ], - "libraries": [ - "sqlite3", - ], - ], - "custom_backend": [ - "sha256": "2201a61eaf7e06e1937cb73a469fb36cabc219496ba004b85feb2cc7c10f300d", - "sha256" + debug: "3eb6eb97bf0641d2305b0f50ff05a8862d7d65e2491cf4aa05ef1d108649f07c", - ], - "executorch": [ - "sha256": "2b55cbcff845ab9eaf16a21e520546b2975ef8c55b9e3fbbcc0c375334e40c6f", - "sha256" + debug: "12933cedff6cf21c9d21668779f8d8af8049646fe7d290787b12227ff7abe4a7", - ], - "mps_backend": [ - "sha256": "510d708361b6ea0692ce5aeb638725d6275824b37bbe744aa876fda24cc2bbbf", - "sha256" + debug: "6a67ba0bf8033f17bd66acb222446df51cd1304e24a4fb2c6d97e15a30fb24f0", - "frameworks": [ - "Metal", - "MetalPerformanceShaders", - "MetalPerformanceShadersGraph", - ], - ], - "optimized_backend": [ - "sha256": "50aaa54901a7cee1059e71cc623f054610406d65ba8fd6edb10b45861be67237", - "sha256" + debug: "3f43f465727c8705432f4bb69260cc9501c519e5da006fc19ee2ab2ea260d1f0", - ], - "portable_backend": [ - "sha256": "964238e92828665aa598c05b2264faab91fb13ce0f42633cc7d5653300af3e9b", - "sha256" + debug: "d6d85304a4b40f13c9b893e8c264ebdb15307cacf8997494b3818a52e4914b28", - ], - "quantized_backend": [ - "sha256": "37d31a319f92e26bab2b7ec5e783a8b14457dee0a4638dcdca1d9e17539ee3fb", - "sha256" + debug: "6b45f66f60f6106a41e191418c970bf7b0605df73b9815a06441a5f0809b54e6", - ], - "xnnpack_backend": [ - "sha256": 
"03d506243c392e872519ae1335a025ef202319c1db339a753f9d7d74cba226f0", - "sha256" + debug: "3341e89abc99552a6a5bad360003baed194a83e865338bc07afe9e4f171ea169", - ], -].reduce(into: [String: [String: Any]]()) { - $0[$1.key] = $1.value - $0[$1.key + debug] = $1.value -} -.reduce(into: [String: [String: Any]]()) { - var newValue = $1.value - if $1.key.hasSuffix(debug) { - $1.value.forEach { key, value in - if key.hasSuffix(debug) { - newValue[String(key.dropLast(debug.count))] = value - } - } - } - $0[$1.key] = newValue.filter { key, _ in !key.hasSuffix(debug) } -} - -let package = Package( - name: "executorch", - platforms: [ - .iOS(.v15), - ], - products: deliverables.keys.map { key in - .library(name: key, targets: ["\(key)_dependencies"]) - }.sorted { $0.name < $1.name }, - targets: deliverables.flatMap { key, value -> [Target] in - [ - .binaryTarget( - name: key, - url: "\(url)\(key)-\(version).zip", - checksum: value["sha256"] as? String ?? "" - ), - .target( - name: "\(key)_dependencies", - dependencies: [.target(name: key)], - path: ".swift/\(key)", - linkerSettings: - (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + - (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) } - ), - ] - } -) diff --git a/README.md b/README.md index 4f6d3da7240..e63a2d20caa 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Key value propositions of ExecuTorch are: capabilities such as CPUs, NPUs, and DSPs. For a comprehensive technical overview of ExecuTorch and step-by-step tutorials, -please visit our [documentation website](https://pytorch.org/executorch). +please visit our documentation website [for the latest release](https://pytorch.org/executorch/stable/index.html) (or the [main branch](https://pytorch.org/executorch/main/index.html)). ## Important: This is a preview release diff --git a/backends/apple/coreml/.clang-format b/backends/apple/coreml/.clang-format index 0c8764539cf..0b04022b0f6 100644 --- a/backends/apple/coreml/.clang-format +++ b/backends/apple/coreml/.clang-format @@ -1,5 +1,4 @@ BasedOnStyle: WebKit -BreakBeforeBraces: Attach AllowShortIfStatementsOnASingleLine: false BreakBeforeBinaryOperators: None BreakConstructorInitializers: BeforeColon diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index f1c19d00ee8..b3d0182999a 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,6 +13,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() +option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." 
OFF) + # inmemoryfs sources set(INMEMORYFS_SOURCES runtime/inmemoryfs/inmemory_filesystem.cpp @@ -144,7 +146,7 @@ target_include_directories( ) target_link_libraries( coremldelegate PRIVATE - executorch + executorch_no_prim_ops ) if(EXECUTORCH_BUILD_SDK) @@ -174,18 +176,26 @@ find_library(SQLITE_LIBRARY sqlite3) target_link_libraries(coremldelegate PRIVATE - executorch + executorch_no_prim_ops ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY} ) +if(COREML_BUILD_EXECUTOR_RUNNER) +target_link_libraries(coremldelegate + PRIVATE + portable_ops_lib + portable_kernels +) +endif() + target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") if(EXECUTORCH_BUILD_SDK) -target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) +target_compile_options(executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(coremldelegate PRIVATE "-frtti") target_compile_options(libprotobuf-lite PRIVATE "-frtti") else() diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md index 1710860f87e..4a21d8d8ae1 100644 --- a/backends/apple/coreml/README.md +++ b/backends/apple/coreml/README.md @@ -6,54 +6,123 @@ Core ML is an optimized framework for running machine learning models on Apple d ## Layout - `compiler/` : Lowers a module to Core ML backend. +- `partition/`: Partitions a module fully or partially to Core ML backend. +- `quantizer/`: Quantizes a module in Core ML favored scheme. - `scripts/` : Scripts for installing dependencies and running tests. - `runtime/`: Core ML delegate runtime implementation. - `inmemoryfs`: InMemory filesystem implementation used to serialize/de-serialize AOT blob. - `kvstore`: Persistent Key-Value store implementation. - `delegate`: Runtime implementation. - `include` : Public headers. - - `tests` : Tests for Core ML delegate. - - `workspace` : Xcode workspace for tests. + - `sdk` : SDK implementation. + - `tests` : Unit tests. + - `workspace` : Xcode workspace for the runtime. - `third-party/`: External dependencies. -## Help & Improvements -If you have problems or questions or have suggestions for ways to make -implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). +## Partition and Delegation -## Delegation - -For delegating the Program to the **Core ML** backend, the client must be responsible for calling `to_backend` with the **CoreMLBackend** tag. +To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**. 
```python -import executorch.exir import torch - -from executorch.exir.backend.backend_api import to_backend +import executorch.exir from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition.coreml_partitioner import CoreMLPartitioner -class LowerableSubModel(torch.nn.Module): +class Model(torch.nn.Module): def __init__(self): super().__init__() def forward(self, x): return torch.sin(x) -# Convert the lowerable module to Edge IR Representation -to_be_lowered = LowerableSubModel() -example_input = (torch.ones(1), ) -to_be_lowered_exir_submodule = exir.capture(to_be_lowered, example_input).to_edge() +source_model = Model() +example_inputs = (torch.ones(1), ) -# Lower to Core ML backend -lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule.exported_program, []) +# Export the source model to Edge IR representation +aten_program = torch.export.export(source_model, example_inputs) +edge_program_manager = executorch.exir.to_edge(aten_program) + +# Delegate to Core ML backend +delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) + +# Serialize delegated program +executorch_program = delegated_program_manager.to_executorch() +with open("model.pte", "wb") as f: + f.write(executorch_program.buffer) ``` -Currently, the **Core ML** backend delegates the whole module to **Core ML**. If a specific op is not supported by the **Core ML** backend then the `to_backend` call would throw an exception. We will be adding a **Core ML Partitioner** to resolve the issue. +The module will be fully or partially delegated to **Core ML**, depending on whether all or only some of its ops are supported by the **Core ML** backend. Users may force certain ops to be skipped by passing `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)`. + +The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/); `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. -The `to_backend` implementation is a thin wrapper over `coremltools`, `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. ## Quantization + +To quantize a Program in a Core ML-favored way, the client may utilize **CoreMLQuantizer**.
+ +```python +import torch +import executorch.exir + +from torch._export import capture_pre_autograd_graph +from torch.ao.quantization.quantize_pt2e import ( + convert_pt2e, + prepare_pt2e, + prepare_qat_pt2e, +) + +from executorch.backends.apple.coreml.quantizer.coreml_quantizer import CoreMLQuantizer +from coremltools.optimize.torch.quantization.quantization_config import ( + LinearQuantizerConfig, + QuantizationScheme, +) + +class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=3, out_channels=16, kernel_size=3, padding=1 + ) + self.relu = torch.nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + a = self.conv(x) + return self.relu(a) + +source_model = Model() +example_inputs = (torch.randn((1, 3, 256, 256)), ) + +pre_autograd_aten_dialect = capture_pre_autograd_graph(source_model, example_inputs) + +quantization_config = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": QuantizationScheme.symmetric, + "activation_dtype": torch.uint8, + "weight_dtype": torch.int8, + "weight_per_channel": True, + } + } +) +quantizer = CoreMLQuantizer(quantization_config) + +# For post-training quantization, use `prepare_pt2e` +# For quantization-aware training, use `prepare_qat_pt2e` +prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer) + +prepared_graph(*example_inputs) +converted_graph = convert_pt2e(prepared_graph) +``` + +The `converted_graph` is the quantized torch model, and can similarly be delegated to **Core ML** through **CoreMLPartitioner**. ## Runtime -To execute a **Core ML** delegated **Program**, the client must link to the `coremldelegate` library. Once linked there are no additional steps required, **ExecuTorch** when running the **Program** would call the **Core ML** runtime to execute the **Core ML** delegated part of the **Program**. +To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked, there are no additional steps required: when running the program, ExecuTorch calls the Core ML runtime to execute the Core ML delegated part of the program. Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library. + +## Help & Improvements +If you have problems or questions or have suggestions for ways to make +implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index da399e80d54..6fe37925d27 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -630,7 +630,7 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing } if (_estimatedSizeInBytes <= sizeInBytes) { - return YES; + return _estimatedSizeInBytes; } std::error_code ec; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h index eab239b496c..13b1023bcbc 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h @@ -13,7 +13,8 @@ NS_ASSUME_NONNULL_BEGIN /// The default model executor, the executor ignores logging options.
-__attribute__((objc_subclassing_restricted)) @interface ETCoreMLDefaultModelExecutor : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLDefaultModelExecutor : NSObject + (instancetype)new NS_UNAVAILABLE; @@ -27,6 +28,9 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLDefaultModelExec /// The model. @property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. +@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 399c91bd495..57316e28015 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -26,6 +26,9 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { loggingOptions:(const executorchcoreml::ModelLoggingOptions& __unused)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable __unused)eventLogger error:(NSError * __autoreleasing *)error { + if (self.ignoreOutputBackings) { + predictionOptions.outputBackings = @{}; + } id outputs = [self.model.mlModel predictionFromFeatures:inputs options:predictionOptions error:error]; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h index 1a1b10848bb..d9c4d4ef638 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h @@ -7,6 +7,7 @@ #import +#import #import NS_ASSUME_NONNULL_BEGIN @@ -48,7 +49,11 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { /// Record the error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ + } \ if (errorOut) { \ *errorOut = \ [NSError errorWithDomain:ETCoreMLErrorDomain \ @@ -58,24 +63,31 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { }]; \ } -/// Record the error and its underlying error with `os_log_error` and fills -/// `*errorOut` with NSError. +/// Record the error and its underlying error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogUnderlyingErrorAndSetNSError(errorOut, errorCode, underlyingNSError, formatString, ...) \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with underlying error= %@.", \ - ##__VA_ARGS__, \ - (underlyingNSError).localizedDescription); \ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, \ + formatString ", with underlying error= %@.", \ + ##__VA_ARGS__, \ + (underlyingNSError).localizedDescription); \ + } \ if (errorOut) { \ *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ underlyingError:underlyingNSError \ format:@formatString, ##__VA_ARGS__]; \ } -#define ETCoreMLLogError(error, formatString, ...) 
\ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with error= %@.", \ - ##__VA_ARGS__, \ - (error).localizedDescription); +#define ETCoreMLLogError(error, formatString, ...) \ + if (ET_LOG_ENABLED) { \ + ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ + } else { \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, \ + formatString ", with error= %@.", \ + ##__VA_ARGS__, \ + (error).localizedDescription); \ + } #pragma clang diagnostic pop diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 0f8a440c858..9bf3183e65a 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -6,12 +6,18 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import +#import NS_ASSUME_NONNULL_BEGIN @class ETCoreMLAsset; +namespace executorchcoreml { +class MultiArray; +} + /// Represents a ML model, the class is a thin wrapper over `MLModel` with additional properties. +__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModel : NSObject - (instancetype)init NS_UNAVAILABLE; @@ -31,6 +37,12 @@ NS_ASSUME_NONNULL_BEGIN orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index 791fb7c03b6..ee7218bd271 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -8,6 +8,164 @@ #import #import +#import +#import +#import +#import + +#pragma mark - ETCoreMLMultiArrayDescriptor +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLMultiArrayDescriptor: NSObject + +- (instancetype)init NS_UNAVAILABLE; + ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithShape:(NSArray *)shape + dataType:(MLMultiArrayDataType)dataType NS_DESIGNATED_INITIALIZER; + +@property (copy, readonly, nonatomic) NSArray *shape; + +@property (assign, readonly, nonatomic) MLMultiArrayDataType dataType; + +@end + +@implementation ETCoreMLMultiArrayDescriptor + +- (instancetype)initWithShape:(NSArray *)shape + dataType:(MLMultiArrayDataType)dataType { + self = [super init]; + if (self) { + _shape = shape; + _dataType = dataType; + } + + return self; +} + +- (BOOL)isEqual:(id)object { + if (object == self) { + return YES; + } + + if (![object isKindOfClass:self.class]) { + return NO; + } + + ETCoreMLMultiArrayDescriptor *other = (ETCoreMLMultiArrayDescriptor *)object; + return [self.shape isEqualToArray:other.shape] && self.dataType == other.dataType; +} + +- (NSUInteger)hash { + return [self.shape hash] ^ (NSUInteger)self.dataType; +} + +- (instancetype)copyWithZone:(NSZone *)zone { + return [[ETCoreMLMultiArrayDescriptor allocWithZone:zone] initWithShape:self.shape + dataType:self.dataType]; +} + +@end + +namespace { + +using namespace executorchcoreml; + +size_t get_number_of_bytes(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return 2; + } 
+ case MLMultiArrayDataTypeFloat32: { + return 4; + } + case MLMultiArrayDataTypeInt32: { + return 4; + } + case MLMultiArrayDataTypeFloat64: { + return 8; + } + default: { + return 0; + } + } +} + +std::vector calculate_strides(const std::vector& shape) { + if (shape.size() == 0) { + return {}; + } + + if (shape.size() == 1) { + return {1}; + } + + std::vector strides(shape.size(), 1); + size_t product = 1; + for (size_t i = shape.size(); i > 0; i--) { + strides[i - 1] = product; + product *= shape[i - 1]; + } + + return strides; +} + +MLMultiArray * _Nullable make_ml_multi_array(const std::vector& shape, + MLMultiArrayDataType dataType, + NSCache *cache, + NSError * __autoreleasing *error) { + ETCoreMLMultiArrayDescriptor *descriptor = [[ETCoreMLMultiArrayDescriptor alloc] initWithShape:to_array(shape) + dataType:dataType]; + // Check the cache first otherwise allocate a new backing storage. + NSMutableData *backing_storage = [cache objectForKey:descriptor]; + if (backing_storage) { + [cache removeObjectForKey:descriptor]; + } else { + size_t n = std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies{}); + backing_storage = [[NSMutableData alloc] initWithLength:n * get_number_of_bytes(dataType)]; + } + + __weak NSCache *weakCache = cache; + // Add the storage back to the cache when it gets deallocated, the next prediction would use the same storage. + MLMultiArray *result = [[MLMultiArray alloc] initWithDataPointer:backing_storage.mutableBytes + shape:descriptor.shape + dataType:descriptor.dataType + strides:to_array(calculate_strides(shape)) + deallocator:^(void * _Nonnull bytes) {[weakCache setObject:backing_storage forKey:descriptor];} + error:error]; + + return result; +} + +NSDictionary * +get_multi_array_constraints_by_name(NSDictionary *feature_descriptions) { + NSMutableDictionary *result = [NSMutableDictionary dictionaryWithCapacity:feature_descriptions.count]; + [feature_descriptions enumerateKeysAndObjectsUsingBlock:^(NSString *key, MLFeatureDescription *description, BOOL * _Nonnull stop) { + result[key] = description.multiArrayConstraint; + }]; + + return result; +} + +NSDictionary *get_multi_array_input_constraints_by_name(MLModelDescription *description) { + return get_multi_array_constraints_by_name(description.inputDescriptionsByName); +} + +NSDictionary *get_multi_array_output_constraints_by_name(MLModelDescription *description) { + return get_multi_array_constraints_by_name(description.outputDescriptionsByName); +} + +} + +#pragma mark - ETCoreMLModel +@interface ETCoreMLModel () + +@property (strong, readonly, nonatomic) NSCache *cache; +@property (copy, readonly, nonatomic) NSDictionary *inputConstraintsByName; +@property (copy, readonly, nonatomic) NSDictionary *outputConstraintsByName; + +@end + @implementation ETCoreMLModel @@ -33,8 +191,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _asset = asset; _orderedInputNames = [orderedInputNames copy]; _orderedOutputNames = [orderedOutputNames copy]; + _cache = [[NSCache alloc] init]; + _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); + _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); } - + return self; } @@ -42,4 +203,73 @@ - (NSString *)identifier { return self.asset.identifier; } +- (nullable NSArray *)prepareArgs:(const std::vector&)args + argNames:(NSOrderedSet *)argNames + argConstraintsByName:(NSDictionary *)argConstraintsByName + copyData:(const BOOL)copyData + error:(NSError * 
__autoreleasing *)error { + NSEnumerator *nameEnumerator = [argNames objectEnumerator]; + NSMutableArray *result = [NSMutableArray arrayWithCapacity:args.size()]; + for (const auto& arg : args) { + BOOL lCopyData = copyData; + NSString *argName = [nameEnumerator nextObject]; + MLMultiArrayConstraint *constraint = argConstraintsByName[argName]; + const auto& layout = arg.layout(); + auto dataType = to_ml_multiarray_data_type(layout.dataType()); + MLMultiArray *multiArrayArg = nil; + if (dataType == constraint.dataType) { + // We can use the same data storage. + multiArrayArg = [[MLMultiArray alloc] initWithDataPointer:arg.data() + shape:to_array(layout.shape()) + dataType:constraint.dataType + strides:to_array(layout.strides()) + deallocator:^(void * _Nonnull bytes) {} + error:error]; + lCopyData = NO; + } else { + // We can't use the same data storage, data types are not the same. + multiArrayArg = ::make_ml_multi_array(layout.shape(), constraint.dataType, self.cache, error); + } + + if (!multiArrayArg) { + return nil; + } + + if (multiArrayArg && lCopyData) { + [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, + NSInteger __unused size, + NSArray *strides) { + MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), + layout.shape(), + to_vector(strides))); + arg.copy(buffer); + }]; + } + + [result addObject:multiArrayArg]; + } + + return result; +} + +- (nullable NSArray *)prepareInputs:(const std::vector&)inputs + error:(NSError * __autoreleasing *)error { + return [self prepareArgs:inputs + argNames:self.orderedInputNames + argConstraintsByName:self.inputConstraintsByName + copyData:YES + error:error]; + +} + +- (nullable NSArray *)prepareOutputBackings:(const std::vector&)outputs + error:(NSError * __autoreleasing *)error { + return [self prepareArgs:outputs + argNames:self.orderedOutputNames + argConstraintsByName:self.outputConstraintsByName + copyData:NO + error:error]; + +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.h index 3a3578e06ab..f846ebbb969 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.h @@ -9,7 +9,8 @@ NS_ASSUME_NONNULL_BEGIN /// A class responsible for compiling a CoreML model. -__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelCompiler : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelCompiler : NSObject + (instancetype)new NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h index e6e329c9ddd..2f1b22f456b 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelExecutor.h @@ -35,6 +35,9 @@ NS_ASSUME_NONNULL_BEGIN /// The model. @property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. 
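// Illustrative sketch (not part of the patch; shape and dtype assumed): the row-major
// stride math produced by calculate_strides() in ETCoreMLModel.mm above, and the
// backing-buffer size that make_ml_multi_array() derives from it before handing the
// storage to an MLMultiArray.
#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <sys/types.h>
#include <vector>

int main() {
    std::vector<size_t> shape = {2, 3, 4};
    // Walk the shape right-to-left, carrying the running product as the stride.
    std::vector<ssize_t> strides(shape.size(), 1);
    size_t product = 1;
    for (size_t i = shape.size(); i > 0; --i) {
        strides[i - 1] = static_cast<ssize_t>(product);
        product *= shape[i - 1];
    }
    assert((strides == std::vector<ssize_t>{12, 4, 1}));
    // 24 elements at 2 bytes each (MLMultiArrayDataTypeFloat16) gives a 48-byte backing buffer.
    size_t n = std::accumulate(shape.cbegin(), shape.cend(), size_t(1), std::multiplies<size_t>());
    assert(n * 2 == 48);
    return 0;
}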
+@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.h index 3d2e1006329..05e96ad59f5 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.h @@ -16,7 +16,8 @@ struct ModelMetadata; NS_ASSUME_NONNULL_BEGIN /// A class responsible for loading a CoreML model. -__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelLoader : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelLoader : NSObject + (instancetype)new NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h index fb616c71527..394cff4f897 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h @@ -7,11 +7,14 @@ #import +#import + NS_ASSUME_NONNULL_BEGIN namespace executorchcoreml { struct ModelLoggingOptions; class ModelEventLogger; +class MultiArray; }; @class ETCoreMLModel; @@ -20,7 +23,8 @@ class ModelEventLogger; typedef void ModelHandle; /// A class responsible for managing the models loaded by the delegate. -__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelManager : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelManager : NSObject + (instancetype)new NS_UNAVAILABLE; @@ -49,7 +53,7 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelManager : N /// Executes the loaded model. /// /// @param handle The handle to the loaded model. -/// @param args The arguments to the model. +/// @param args The arguments (inputs and outputs) of the model. /// @param loggingOptions The model logging options. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the execution succeeded otherwise `NO`. @@ -59,6 +63,19 @@ __attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelManager : N eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError* __autoreleasing*)error; +/// Executes the loaded model. +/// +/// @param handle The handle to the loaded model. +/// @param argsVec The arguments (inputs and outputs) of the model. +/// @param loggingOptions The model logging options. +/// @param error On failure, error is filled with the failure information. +/// @retval `YES` if the execution succeeded otherwise `NO`. +- (BOOL)executeModelWithHandle:(ModelHandle*)handle + argsVec:(const std::vector&)argsVec + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger + error:(NSError* __autoreleasing*)error; + /// Unloads the loaded model. /// /// @param handle The handle to the loaded model. 
diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 1c0d2a30f97..c51de9d1e14 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -22,6 +22,8 @@ #import #import #import +#import +#import #import #import #import @@ -98,32 +100,60 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { return options; } -BOOL copy(MLMultiArray *src, MLMultiArray *dst, NSError * __autoreleasing *error) { - if (![src.shape isEqualToArray:dst.shape]) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@: Model is broken", NSStringFromClass(ETCoreMLModelManager.class)); - return NO; - } +void copy(MLMultiArray *src, MLMultiArray *dst) { if (::is_backed_by_same_buffer(src, dst)) { - return YES; - } - @autoreleasepool { - [src copyInto:dst]; + return; } - return YES; + + [src copyInto:dst]; } -BOOL set_outputs(NSArray *outputs, - NSArray *model_outputs, - NSError * __autoreleasing *error) { +void set_outputs(NSArray *outputs, NSArray *model_outputs) { NSEnumerator *enumerator = [model_outputs objectEnumerator]; for (MLMultiArray *output in outputs) { MLMultiArray *model_output = [enumerator nextObject]; - if (!::copy(output, model_output, error)) { - return NO; + ::copy(model_output, output); + } +} + +std::optional get_data_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return MultiArray::DataType::Float16; + } + case MLMultiArrayDataTypeFloat32: { + return MultiArray::DataType::Float32; + } + case MLMultiArrayDataTypeFloat64: { + return MultiArray::DataType::Float64; + } + case MLMultiArrayDataTypeInt32: { + return MultiArray::DataType::Int32; + } + default: { + return std::nullopt; } } - - return YES; +} + +void copy(MLMultiArray *src, executorchcoreml::MultiArray& dst) { + [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + if (bytes == dst.data()) { + return; + } + + MultiArray::MemoryLayout src_layout(get_data_type(src.dataType).value(), to_vector(src.shape), to_vector(src.strides)); + MultiArray(const_cast(bytes), std::move(src_layout)).copy(dst); + }]; +} + +void set_outputs(std::vector& outputs, + NSArray *model_outputs) { + NSEnumerator *enumerator = [model_outputs objectEnumerator]; + for (auto& output : outputs) { + MLMultiArray *model_output = [enumerator nextObject]; + ::copy(model_output, output); + } } NSData * _Nullable get_file_data(const inmemoryfs::InMemoryFileSystem *inMemoryFS, @@ -313,6 +343,7 @@ void add_compute_unit(std::string& identifier, MLComputeUnits compute_units) { return result; } + #endif } //namespace @@ -467,7 +498,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier return [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset metadata:metadata - operationPathToDebugSymbolMap: operation_path_to_symbol_name_map + operationPathToDebugSymbolMap:operation_path_to_symbol_name_map configuration:configuration assetManager:self.assetManager error:error]; @@ -641,6 +672,48 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { os_unfair_lock_unlock(&_lock); } +- (nullable NSArray *)executeModelUsingExecutor:(id)executor + inputs:(NSArray *)inputs + outputBackings:(NSArray *)outputBackings + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* 
_Nullable)eventLogger + error:(NSError * __autoreleasing *)error { + NSError *localError = nil; + ETCoreMLModel *model = executor.model; + MLPredictionOptions *predictionOptions = ::get_prediction_options(outputBackings, model.orderedOutputNames, error); + if (!predictionOptions) { + return nil; + } + + id inputFeatures = ::get_feature_provider(inputs, model.orderedInputNames, error); + if (!inputFeatures) { + return nil; + } + + NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + // Try without output backings. + if (!modelOutputs && predictionOptions.outputBackings.count > 0) { + localError = nil; + executor.ignoreOutputBackings = YES; + } + + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + + if (error) { + *error = localError; + } + + return modelOutputs; +} + - (BOOL)executeModelWithHandle:(ModelHandle *)handle args:(NSArray *)args loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions @@ -659,33 +732,91 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (args.count != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid.", - NSStringFromClass(self.class)); + "%@: Model is invalid, expected args count to be %lu but got %lu.", + NSStringFromClass(self.class), + static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), + args.count); return NO; } - - NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; - NSArray *outputs = [args subarrayWithRange:NSMakeRange(model.orderedInputNames.count, args.count - model.orderedInputNames.count)]; - id inputFeatures = ::get_feature_provider(inputs, model.orderedInputNames, error); - if (!inputFeatures) { - return NO; + @autoreleasepool { + NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; + NSArray *outputs = [args subarrayWithRange:NSMakeRange(model.orderedInputNames.count, args.count - model.orderedInputNames.count)]; + NSArray *outputBackings = @[]; + if (executor.ignoreOutputBackings == NO) { + outputBackings = outputs; + } + + NSArray *modelOutputs = [self executeModelUsingExecutor:executor + inputs:inputs + outputBackings:outputBackings + loggingOptions:loggingOptions + eventLogger:eventLogger + error:error]; + if (!modelOutputs) { + return NO; + } + + ::set_outputs(outputs, modelOutputs); } - MLPredictionOptions *predictionOptions = ::get_prediction_options(outputs, model.orderedOutputNames, error); - if (!predictionOptions) { + return YES; +} + +- (BOOL)executeModelWithHandle:(ModelHandle *)handle + argsVec:(const std::vector&)argsVec + loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions + eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger + error:(NSError * __autoreleasing *)error { + id executor = [self executorWithHandle:handle]; + if (!executor) { + ETCoreMLLogErrorAndSetNSError(error, + 0, + "%@: Model is already unloaded.", + NSStringFromClass(self.class)); return NO; } - NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:error]; - if (!outputs) { + ETCoreMLModel *model = 
executor.model; + if (argsVec.size() != model.orderedInputNames.count + model.orderedOutputNames.count) { + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@: Model is invalid, expected args count to be %lu but got %lu.", + NSStringFromClass(self.class), + static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), + argsVec.size()); return NO; } - return ::set_outputs(outputs, modelOutputs, error); + std::vector inputArgs(argsVec.begin(), argsVec.begin() + model.orderedInputNames.count); + std::vector outputArgs(argsVec.begin() + model.orderedInputNames.count, argsVec.end()); + @autoreleasepool { + NSArray *inputs = [model prepareInputs:inputArgs error:error]; + if (!inputs) { + return NO; + } + + NSArray *outputBackings = @[]; + if (executor.ignoreOutputBackings == NO) { + outputBackings = [model prepareOutputBackings:outputArgs error:error]; + } + + if (!outputBackings) { + return NO; + } + + NSArray *modelOutputs = [self executeModelUsingExecutor:executor + inputs:inputs + outputBackings:outputBackings + loggingOptions:loggingOptions + eventLogger:eventLogger + error:error]; + if (!modelOutputs) { + return NO; + } + + ::set_outputs(outputArgs, modelOutputs); + return YES; + } } - (BOOL)unloadModelWithHandle:(ModelHandle *)handle { diff --git a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm index 4aa5fffe94a..b8a10fcbbbc 100644 --- a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm +++ b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm @@ -7,55 +7,17 @@ #import +#import #import namespace { using namespace executorchcoreml; -template -T toValue(NSNumber *value); - -template<> size_t toValue(NSNumber *value) { - return value.unsignedLongValue; -} - -template<> ssize_t toValue(NSNumber *value) { - return value.longLongValue; -} - -template::value, T>::type> -std::vector to_vector(NSArray *numbers) { - std::vector result; - result.reserve(numbers.count); - for (NSNumber *number in numbers) { - result.emplace_back(toValue(number)); - } - - return result; -} - -MultiArray::DataType to_multi_array_data_type(MLMultiArrayDataType data_type) { - switch (data_type) { - case MLMultiArrayDataTypeInt32: { - return MultiArray::DataType::Int; - } - case MLMultiArrayDataTypeFloat: { - return MultiArray::DataType::Float; - } - case MLMultiArrayDataTypeFloat16: { - return MultiArray::DataType::Float16; - } - case MLMultiArrayDataTypeDouble: { - return MultiArray::DataType::Double; - } - } -} - MultiArray to_multi_array(void *data, MLMultiArrayDataType dataType, NSArray *shape, NSArray *strides) { - auto layout = MultiArray::MemoryLayout(to_multi_array_data_type(dataType), + auto layout = MultiArray::MemoryLayout(to_multiarray_data_type(dataType).value(), to_vector(shape), to_vector(strides)); return MultiArray(data, std::move(layout)); diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index d6a6016c087..ed921fb35bd 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -26,7 +26,7 @@ class BackendDelegate { struct Config { // Max models cache size in bytes. - size_t max_models_cache_size = 2 * size_t(1024) * size_t(1024) * size_t(1024); + size_t max_models_cache_size = 10 * size_t(1024) * size_t(1024) * size_t(1024); // If set to `true`, delegate pre-warms the most recently used asset. 
bool should_prewarm_asset = true; // If set to `true`, delegate pre-warms the model in `init`. diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index b91a6208b6a..1ded4a76b3b 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -44,44 +44,6 @@ MLComputeUnits get_compute_units(const Buffer& buffer) { return configuration; } -template::value, T>::type> -NSArray *to_array(const std::vector& array) { - NSMutableArray *result = [NSMutableArray arrayWithCapacity:array.size()]; - for (T value : array) { - [result addObject:@(value)]; - } - - return result; -} - -MLMultiArrayDataType get_data_type(MultiArray::DataType dataType) { - switch (dataType) { - case MultiArray::DataType::Float16: { - return MLMultiArrayDataTypeFloat16; - } - case MultiArray::DataType::Float: { - return MLMultiArrayDataTypeFloat32; - } - case MultiArray::DataType::Double: { - return MLMultiArrayDataTypeDouble; - } - case MultiArray::DataType::Int: { - return MLMultiArrayDataTypeInt32; - } - } -} - -MLMultiArray * _Nullable to_ml_multiarray(const MultiArray& array, NSError * __autoreleasing *error) { - const auto& layout = array.layout(); - MLMultiArray *result = [[MLMultiArray alloc] initWithDataPointer:array.data() - shape:to_array(layout.shape()) - dataType:get_data_type(layout.dataType()) - strides:to_array(layout.strides()) - deallocator:^(void * _Nonnull bytes) {} - error:error]; - return result; -} - NSURL * _Nullable create_directory_if_needed(NSURL *url, NSFileManager *fileManager, NSError * __autoreleasing *error) { @@ -194,17 +156,8 @@ bool execute(Handle* handle, ModelEventLogger *event_logger, std::error_code& ec) const noexcept override { NSError *error = nil; - NSMutableArray *model_args = [NSMutableArray arrayWithCapacity:args.size()]; - for (const auto& arg : args) { - MLMultiArray *multi_array = to_ml_multiarray(arg, &error); - if (!multi_array) { - return false; - } - [model_args addObject:multi_array]; - } - if (![model_manager_ executeModelWithHandle:handle - args:model_args + argsVec:args loggingOptions:logging_options eventLogger:event_logger error:&error]) { diff --git a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist index 7dd12acaaf8..df37a47755f 100644 --- a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist +++ b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist @@ -7,6 +7,6 @@ shouldPrewarmModel maxAssetsSizeInBytes - 2147483648 + 1073741824 diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index a51e73ee68d..b672d4a08e4 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -28,16 +28,25 @@ using namespace executorchcoreml; std::optional get_data_type(ScalarType scalar_type) { - if (scalar_type == ScalarType::Float) { - return MultiArray::DataType::Float; - } else if (scalar_type == ScalarType::Double) { - return MultiArray::DataType::Double; - } else if (scalar_type == ScalarType::Half) { - return MultiArray::DataType::Float16; - } else if (scalar_type == ScalarType::Int) { - return MultiArray::DataType::Int; - } else { - return std::nullopt; + switch (scalar_type) { 
+ case ScalarType::Bool: + return MultiArray::DataType::Bool; + case ScalarType::Byte: + return MultiArray::DataType::Byte; + case ScalarType::Short: + return MultiArray::DataType::Short; + case ScalarType::Int: + return MultiArray::DataType::Int32; + case ScalarType::Long: + return MultiArray::DataType::Int64; + case ScalarType::Half: + return MultiArray::DataType::Float16; + case ScalarType::Float: + return MultiArray::DataType::Float32; + case ScalarType::Double: + return MultiArray::DataType::Float64; + default: + return std::nullopt; } } @@ -54,6 +63,7 @@ auto tensor = eValue->toTensor(); auto dataType = get_data_type(tensor.scalar_type()); if (!dataType.has_value()) { + ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type()); return std::nullopt; } @@ -167,7 +177,7 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { auto multi_array = get_multi_array(args[i], ArgType::Input); ET_CHECK_OR_RETURN_ERROR(multi_array.has_value(), Internal, - "%s: Expected tensor at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); + "%s: Failed to create multiarray from input at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); delegate_args.emplace_back(std::move(multi_array.value())); } @@ -176,7 +186,7 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { auto multi_array = get_multi_array(args[i], ArgType::Output); ET_CHECK_OR_RETURN_ERROR(multi_array.has_value(), Internal, - "%s: Expected tensor at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); + "%s: Failed to create multiarray from output at args[%zu]", ETCoreMLStrings.delegateIdentifier.UTF8String, i); delegate_args.emplace_back(std::move(multi_array.value())); } diff --git a/backends/apple/coreml/runtime/delegate/multiarray.h b/backends/apple/coreml/runtime/delegate/multiarray.h index cd165373dc8..70a2a08a2f7 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.h +++ b/backends/apple/coreml/runtime/delegate/multiarray.h @@ -7,6 +7,9 @@ #pragma once +#import +#import +#import #import namespace executorchcoreml { @@ -29,13 +32,33 @@ class Buffer { }; /// A class representing a MultiArray. -class MultiArray { +class MultiArray final { public: /// The MultiArray datatype. - enum class DataType : uint8_t { Int = 0, Double, Float, Float16 }; + enum class DataType : uint8_t { + Bool = 0, + Byte, + Char, + Short, + Int32, + Int64, + Float16, + Float32, + Float64, + }; + + /// Options for copying. + struct CopyOptions { + inline CopyOptions() noexcept : use_bnns(true), use_memcpy(true) { } + + inline CopyOptions(bool use_bnns, bool use_memcpy) noexcept : use_bnns(use_bnns), use_memcpy(use_memcpy) { } + + bool use_bnns = true; + bool use_memcpy = true; + }; /// A class describing the memory layout of a MultiArray. - class MemoryLayout { + class MemoryLayout final { public: MemoryLayout(DataType dataType, std::vector shape, std::vector strides) : dataType_(dataType), shape_(std::move(shape)), strides_(std::move(strides)) { } @@ -53,7 +76,10 @@ class MultiArray { inline size_t rank() const noexcept { return shape_.size(); } /// Returns the number of elements in the MultiArray. - size_t get_num_elements() const noexcept; + size_t num_elements() const noexcept; + + /// Returns the byte size of an element. + size_t num_bytes() const noexcept; /// Returns `true` if the memory layout is packed otherwise `false`. 
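// Illustrative sketch (assumed shapes; a standalone restatement, not the library's code):
// what "packed" means for MemoryLayout::is_packed() declared above, namely that the strides
// are exactly the contiguous last-major strides implied by the shape.
#include <cassert>
#include <sys/types.h>
#include <vector>

static bool contiguous_last_major(const std::vector<size_t>& shape, const std::vector<ssize_t>& strides) {
    ssize_t expected = 1;
    for (size_t i = shape.size(); i > 0; --i) {
        if (strides[i - 1] != expected) {
            return false;
        }
        expected *= static_cast<ssize_t>(shape[i - 1]);
    }
    return true;
}

int main() {
    assert(contiguous_last_major({2, 3}, {3, 1}));   // contiguous, so packed
    assert(!contiguous_last_major({2, 3}, {4, 1}));  // padded row stride, so not packed
    return 0;
}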
bool is_packed() const noexcept; @@ -78,11 +104,42 @@ class MultiArray { /// Copies this into another `MultiArray`. /// /// @param dst The destination `MultiArray`. - bool copy(MultiArray& dst) const noexcept; + void copy(MultiArray& dst, CopyOptions options = CopyOptions()) const noexcept; + + /// Get the value at `indices`. + template inline T value(const std::vector& indices) const noexcept { + return *(static_cast(data(indices))); + } + + /// Set the value at `indices`. + template inline void set_value(const std::vector& indices, T value) const noexcept { + T* ptr = static_cast(data(indices)); + *ptr = value; + } + + /// Get the value at `index`. + template inline T value(size_t index) const noexcept { return *(static_cast(data(index))); } + + /// Set the value at `index`. + template inline void set_value(size_t index, T value) const noexcept { + T* ptr = static_cast(data(index)); + *ptr = value; + } private: + void* data(const std::vector& indices) const noexcept; + + void* data(size_t index) const noexcept; + void* data_; MemoryLayout layout_; }; +/// Converts `MultiArray::DataType` to `MLMultiArrayDataType`. +std::optional to_ml_multiarray_data_type(MultiArray::DataType data_type); + +/// Converts `MLMultiArrayDataType` to `MultiArray::DataType`. +std::optional to_multiarray_data_type(MLMultiArrayDataType data_type); + + } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm index 3b8dcb98a30..74996fb8d5a 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.mm +++ b/backends/apple/coreml/runtime/delegate/multiarray.mm @@ -10,120 +10,16 @@ #import #import - #import #import +#import +#import #import namespace { using namespace executorchcoreml; -template -struct TypedMultiArray { - explicit TypedMultiArray(T *data, MultiArray::MemoryLayout layout) noexcept - :data(data), layout(std::move(layout)) - {} - - T *data; - MultiArray::MemoryLayout layout; -}; - -#pragma mark - BNNS - -template -struct BNNSCopier { - static bool supported() noexcept { - return false; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dstNNSDesc) noexcept {} -}; - -// float -> _Float16 -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat32; - dst_bnns_desc->data_type = BNNSDataTypeFloat16; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// float -> int32_t -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat32; - dst_bnns_desc->data_type = BNNSDataTypeInt32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// _Float16 -> float -template<> -struct BNNSCopier<_Float16, float> { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat16; - dst_bnns_desc->data_type = BNNSDataTypeFloat32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// _Float16 -> int32_t -template<> -struct BNNSCopier<_Float16, int32_t> { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor 
*src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeFloat16; - dst_bnns_desc->data_type = BNNSDataTypeInt32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// int32_t -> _Float16 -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeInt32; - dst_bnns_desc->data_type = BNNSDataTypeFloat16; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -// int32_t -> float -template<> -struct BNNSCopier { - static bool supported() noexcept { - return true; - } - - static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *dst_bnns_desc) noexcept { - src_bnns_desc->data_type = BNNSDataTypeInt32; - dst_bnns_desc->data_type = BNNSDataTypeFloat32; - BNNSCopy(src_bnns_desc, dst_bnns_desc, NULL); - } -}; - -/// Returns BNNSDataLayout and sets strides from the multi-array strides. +// Returns BNNSDataLayout and sets strides from the multi-array strides. /// /// BNNS requires strides to be non-decreasing order; /// `bnns_strides[i] <= bnns_strides[i + 1]`. BNNSDataLayout defines @@ -132,408 +28,491 @@ static void copy(BNNSNDArrayDescriptor *src_bnns_desc, BNNSNDArrayDescriptor *ds /// @param multi_array_strides The multiarray strides. /// @param bnns_strides The bnns strides. /// @retval The `BNNSDataLayout`. -BNNSDataLayout get_bnns_data_layout(const std::vector& multi_array_strides, size_t *bnns_strides) { - uint32_t firstMajorFlag = 1; +std::optional get_bnns_data_layout(const std::vector& multi_array_strides, + size_t *bnns_strides) { + bool first_major = false; uint32_t rank = static_cast(multi_array_strides.size()); if (rank > BNNS_MAX_TENSOR_DIMENSION) { - return (BNNSDataLayout)-1; + return std::nullopt; } if (std::is_sorted(multi_array_strides.begin(), multi_array_strides.end(), std::less())) { - firstMajorFlag = 0; + first_major = false; std::copy(multi_array_strides.begin(), multi_array_strides.end(), bnns_strides); } else if (std::is_sorted(multi_array_strides.begin(), multi_array_strides.end(), std::greater()) ) { - firstMajorFlag = 1; + first_major = true; std::copy(multi_array_strides.rbegin(), multi_array_strides.rend(), bnns_strides); } else { - return (BNNSDataLayout)-1; + return std::nullopt; } // See BNNSDataLayout's raw value how this bitwise-or makes sense. - return (BNNSDataLayout)((rank << 16) | (8 << 12) | firstMajorFlag); + return (BNNSDataLayout) (0x08000 + // flags as canonical first/last major type + 0x10000 * rank + // set dimensionality + (first_major ? 1 : 0)); // set first/last major bit } -/// Initializes BNNSNDArrayDescriptor for the shape and strides. +/// Returns `BNNSDataType` from `MultiArray::DataType`. /// -/// @param layout The memory layout. -/// @param desc The ``BNNSNDArrayDescriptor` to be initialized. -/// @retval `true` if the initialization succeeded otherwise `false`. 
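// Worked example (illustrative; the named BNNS constants in the trailing comments are
// assumptions about the Accelerate headers, not taken from this diff) of the BNNSDataLayout
// value that get_bnns_data_layout() above encodes as 0x10000 * rank + 0x8000 + (first_major ? 1 : 0).
#include <cassert>
#include <cstdint>

static uint32_t encode_bnns_layout(uint32_t rank, bool first_major) {
    return 0x08000u + 0x10000u * rank + (first_major ? 1u : 0u);
}

int main() {
    // Descending (row-major) strides such as {12, 4, 1} take the reversed-copy, first-major branch.
    assert(encode_bnns_layout(3, /*first_major=*/true) == 0x38001);   // expected: BNNSDataLayout3DFirstMajor
    // Ascending strides such as {1, 4} are copied as-is and use the last-major form.
    assert(encode_bnns_layout(2, /*first_major=*/false) == 0x28000);  // expected: BNNSDataLayout2DLastMajor
    return 0;
}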
-bool init_bnns_array_descriptor(const MultiArray::MemoryLayout& layout, BNNSNDArrayDescriptor *desc) { - BNNSDataLayout bnns_layout = get_bnns_data_layout(layout.strides(), desc->stride); - if (bnns_layout == (BNNSDataLayout)-1) { - return false; - } - - std::memset(desc, 0, sizeof(*desc)); - const auto& shape = layout.shape(); - std::copy(shape.begin(), shape.end(), desc->size); - desc->layout = bnns_layout; - desc->data_scale = 1.0f; - desc->data_bias = 0.0f; - - return true; -} - -template -struct MultiArrayBNNSCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!BNNSCopier::supported()) { - return false; +/// @param datatype The multiarray datatype. +/// @retval The `BNNSDataType`. +std::optional get_bnns_data_type(MultiArray::DataType datatype) { + switch (datatype) { + case MultiArray::DataType::Bool: { + return BNNSDataTypeBoolean; } - - BNNSNDArrayDescriptor src_bnns_array; - BNNSNDArrayDescriptor dst_bnns_array; - if (!init_bnns_array_descriptor(src.layout, &src_bnns_array) || !init_bnns_array_descriptor(dst.layout, &dst_bnns_array)) { - return false; + case MultiArray::DataType::Byte: { + return BNNSDataTypeUInt8; + } + case MultiArray::DataType::Char: { + return BNNSDataTypeInt8; + } + case MultiArray::DataType::Short: { + return BNNSDataTypeInt16; + } + case MultiArray::DataType::Int32: { + return BNNSDataTypeInt32; + } + case MultiArray::DataType::Int64: { + return BNNSDataTypeInt64; + } + case MultiArray::DataType::Float16: { + return BNNSDataTypeFloat16; + } + case MultiArray::DataType::Float32: { + return BNNSDataTypeFloat32; + } + default: { + return std::nullopt; } - - BNNSCopier::copy(&src_bnns_array, &dst_bnns_array); - return true; } -}; - -#pragma mark - VImageCopier +} -bool init_vi_Buffer(const MultiArray::MemoryLayout& layout, vImage_Buffer *viBuf, size_t bytesPerScalar) { - size_t rank = layout.rank(); - const auto& shape = layout.shape(); - const auto& strides = layout.strides(); - - if (rank < 2) { - // vImage path requires at least two dimensions. - return false; - } - - // vImage blitter requires first major and every dimension except row (shape[rank - 2]) is contiguous. - if (!std::is_sorted(strides.begin(), strides.end(), std::greater())) { +/// Initializes BNNS array descriptor from multi array. +/// +/// @param bnns_descriptor The descriptor to be initialized. +/// @param multi_array The multiarray. +/// @retval `true` if the initialization succeeded otherwise `false`. 
+bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArray& multi_array) { + const auto& layout = multi_array.layout(); + if (layout.num_elements() == 1) { return false; } - if (strides[rank - 1] != 1) { + auto bnns_datatype = get_bnns_data_type(layout.dataType()); + if (!bnns_datatype) { return false; } - size_t height = std::accumulate(shape.begin(), shape.end() - 1, size_t(1), std::multiplies()); - if (height * strides[rank - 2] != strides[0] * shape[0]) { + std::memset(&bnns_descriptor, 0, sizeof(bnns_descriptor)); + auto bnns_layout = get_bnns_data_layout(layout.strides(), bnns_descriptor.stride); + if (!bnns_layout) { return false; } - size_t width = shape[rank - 1]; - size_t rowBytes = strides[rank - 2] * bytesPerScalar; - - viBuf->data = NULL; - viBuf->height = height; - viBuf->width = width; - viBuf->rowBytes = rowBytes; + const auto& shape = layout.shape(); + std::copy(shape.begin(), shape.end(), bnns_descriptor.size); + bnns_descriptor.layout = bnns_layout.value(); + bnns_descriptor.data_scale = 1.0f; + bnns_descriptor.data_bias = 0.0f; + bnns_descriptor.data_type = bnns_datatype.value(); + bnns_descriptor.data = multi_array.data(); return true; } -template -struct VImageCopier { - static bool supported() noexcept { +bool copy_using_bnns(const MultiArray& src, MultiArray& dst) { + if (dst.layout().num_bytes() < src.layout().num_bytes()) { return false; } - - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept {} -}; - -template -struct VImageCopier { - static bool supported() noexcept { - return true; + BNNSNDArrayDescriptor src_descriptor; + if (!init_bnns_descriptor(src_descriptor, src)) { + return false; } - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - vImageCopyBuffer(src_vi_buffer, dst_vi_buffer, sizeof(T), kvImageDoNotTile); - } -}; - -// float -> _Float16 -template <> -struct VImageCopier { - static bool supported() noexcept { - return true; + BNNSNDArrayDescriptor dst_descriptor; + if (!init_bnns_descriptor(dst_descriptor, dst)) { + return false; } - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - vImageConvert_PlanarFtoPlanar16F(src_vi_buffer, dst_vi_buffer, kvImageDoNotTile); - } -}; + return BNNSCopy(&dst_descriptor, &src_descriptor, NULL) == 0; +} -// _Float16 -> float -template <> -struct VImageCopier<_Float16, float> { - static bool supported() noexcept { - return true; - } +std::vector get_layouts(const std::vector& arrays) { + std::vector result; + result.reserve(arrays.size()); - static void copy(vImage_Buffer *src_vi_buffer, vImage_Buffer *dst_vi_buffer) noexcept { - vImageConvert_Planar16FtoPlanarF(src_vi_buffer, dst_vi_buffer, kvImageDoNotTile); - } -}; - -template -struct MultiArrayVImageCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!VImageCopier::supported()) { - return false; - } - - vImage_Buffer src_vi_buffer; - vImage_Buffer dst_vi_buffer; - if (!init_vi_Buffer(src.layout, &src_vi_buffer, sizeof(T1))) { - return false; - } - - if (!init_vi_Buffer(dst.layout, &dst_vi_buffer, sizeof(T2))) { - return false; - } - - VImageCopier::copy(&src_vi_buffer, &dst_vi_buffer); - return true; - } -}; - -#pragma mark - VDSPCopier - -template -struct VDSPCopier { - static bool supported() noexcept { - return false; - } + std::transform(arrays.begin(), arrays.end(), std::back_inserter(result), [](const auto& array) { + return array.layout(); + }); - static void copy(const T1 *src_data, T2 
*dst_data, size_t num_elements) noexcept {} -}; + return result; +} -// Double -> Float -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } +std::vector get_datas(const std::vector& arrays) { + std::vector result; + result.reserve(arrays.size()); - static void copy(const double *src_data, float *dst_data, size_t num_elements) noexcept { - vDSP_vdpsp(src_data, 1, dst_data, 1, num_elements); - } -}; - -// Float -> Double -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } + std::transform(arrays.begin(), arrays.end(), std::back_inserter(result), [](const auto& array) { + return array.data(); + }); - static void copy(const float *src_data, double *dst_data, size_t num_elements) noexcept { - vDSP_vspdp(src_data, 1, dst_data, 1, num_elements); - } -}; + return result; +} -// Float -> Int32 -template<> -struct VDSPCopier { - static bool supported() noexcept { +// We can coalesce two adjacent dimensions if either dim has size 1 or if `shape[n] * stride[n] == stride[n + 1]`. +bool can_coalesce_dimensions(const std::vector& shape, + const std::vector& strides, + size_t dim1, + size_t dim2) { + auto shape1 = shape[dim1]; + auto shape2 = shape[dim2]; + if (shape1 == 1 || shape2 == 1) { return true; } - static void copy(const float *src_data, int32_t *dst_data, size_t num_elements) noexcept { - vDSP_vfix32(src_data, 1, dst_data, 1, num_elements); - } -}; + auto stride1 = strides[dim1]; + auto stride2 = strides[dim2]; + return shape1 * stride1 == stride2; +} -// Int32 -> Double -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; +bool can_coalesce_dimensions(const std::vector& shape, + const std::vector>& all_strides, + size_t dim1, + size_t dim2) { + for (const auto& strides : all_strides) { + if (!::can_coalesce_dimensions(shape, strides, dim1, dim2)) { + return false; + } } - static void copy(const int32_t *src_data, double *dst_data, size_t num_elements) noexcept { - vDSP_vflt32D(src_data, 1, dst_data, 1, num_elements); - } -}; + return true; +} -// Int32 -> Float -template<> -struct VDSPCopier { - static bool supported() noexcept { - return true; - } - - static void copy(const int32_t *src_data, float *dst_data, size_t num_elements) noexcept { - vDSP_vflt32(src_data, 1, dst_data, 1, num_elements); +void update_strides(std::vector>& all_strides, + size_t dim1, + size_t dim2) { + for (auto& strides : all_strides) { + strides[dim1] = strides[dim2]; } -}; +} -template -struct MultiArrayVDSPCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!VDSPCopier::supported()) { - return false; - } - - if (!src.layout.is_packed() || !dst.layout.is_packed()) { - return false; +std::vector coalesce_dimensions(std::vector layouts) { + if (layouts.size() == 0) { + return {}; + } + + std::vector shape = layouts.back().shape(); + // reverse shape. + std::reverse(shape.begin(), shape.end()); + std::vector> all_strides; + // reverse strides. 
+ all_strides.reserve(layouts.size()); + std::transform(layouts.begin(), layouts.end(), std::back_inserter(all_strides), [](const MultiArray::MemoryLayout& layout) { + auto strides = layout.strides(); + std::reverse(strides.begin(), strides.end()); + return strides; + }); + size_t rank = layouts[0].rank(); + size_t prev_dim = 0; + for (size_t dim = 1; dim < rank; ++dim) { + if (::can_coalesce_dimensions(shape, all_strides, prev_dim, dim)) { + if (shape[prev_dim] == 1) { + ::update_strides(all_strides, prev_dim, dim); + } + shape[prev_dim] *= shape[dim]; + } else { + ++prev_dim; + if (prev_dim != dim) { + ::update_strides(all_strides, prev_dim, dim); + shape[prev_dim] = shape[dim]; + } } - - VDSPCopier::copy(src.data, dst.data, src.layout.get_num_elements()); - return true; } -}; - -#pragma mark - MemCopy - -template -struct MemCopier { - static bool supported() noexcept { - return false; + + if (rank == prev_dim + 1) { + return layouts; } - static void copy(const T1 *src_data, T2 *dst_data, size_t num_elements) noexcept {} -}; - -template -struct MemCopier { - static bool supported() noexcept { - return true; + shape.resize(prev_dim + 1); + for (auto& strides : all_strides) { + strides.resize(prev_dim + 1); } - static void copy(const T *src_data, T *dst_data, size_t num_elements) noexcept { - std::memcpy(dst_data, src_data, num_elements); + std::vector result; + result.reserve(layouts.size()); + std::reverse(shape.begin(), shape.end()); + for (size_t i = 0; i < layouts.size(); ++i) { + std::reverse(all_strides[i].begin(), all_strides[i].end()); + result.emplace_back(layouts[i].dataType(), shape, std::move(all_strides[i])); } + + return result; +} + +enum class Direction : uint8_t { + Forward = 0, + Backward }; -template -struct MultiArrayMemCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (!MemCopier::supported()) { - return false; - } - - if (!src.layout.is_packed() || !dst.layout.is_packed()) { - return false; +void set_data_pointers(std::vector& data_pointers, + ssize_t index, + size_t dim, + Direction direction, + const std::vector& layouts) { + for (size_t i = 0; i < layouts.size(); ++i) { + const auto& layout = layouts[i]; + const ssize_t stride = layout.strides()[dim]; + const size_t num_bytes = layout.num_bytes(); + ssize_t offset = 0; + switch (direction) { + case Direction::Forward: { + offset = stride * index * num_bytes; + break; + } + case Direction::Backward: { + offset = - stride * index * num_bytes; + break; + } } - - MemCopier::copy(src.data, dst.data, src.layout.get_num_elements()); - return true; + data_pointers[i] = (void *)(static_cast(data_pointers[i]) + offset); } -}; +} + +void increment_data_pointers(std::vector& data_pointers, + size_t index, + size_t dim, + const std::vector& layouts) { + set_data_pointers(data_pointers, index, dim, Direction::Forward, layouts); +} -#pragma mark - MultiArrayIterator -/// TODO - remove recursion and coalesce contiguous dimensions. 
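// Worked example (values assumed for illustration): the effect of coalesce_dimensions()
// above, which replaces the recursion flagged in the removed TODO. Adjacent dimensions merge
// whenever shape[n] * stride[n] == stride[n + 1] on the reversed shape/strides, so a fully
// contiguous array degenerates into one flat run for the element-wise loop.
#include <cassert>
#include <sys/types.h>
#include <vector>

int main() {
    // Reversed view of shape {2, 3, 4} with contiguous strides {12, 4, 1}.
    std::vector<size_t> shape = {4, 3, 2};
    std::vector<ssize_t> strides = {1, 4, 12};
    // dims 0 and 1: 4 * 1 == 4, merge into 12 elements; then 12 * 1 == 12, merge again.
    assert(static_cast<ssize_t>(shape[0]) * strides[0] == strides[1]);
    assert(static_cast<ssize_t>(shape[0] * shape[1]) * strides[0] == strides[2]);
    // Coalesced result: shape {24}, stride {1}, a single tight loop over 24 elements.
    // A padded outer stride (e.g. {16, 4, 1}) would stop the final merge and keep two dimensions.
    return 0;
}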
-template -struct MultiArrayIterator { - explicit MultiArrayIterator(TypedMultiArray& array1, TypedMultiArray& array2) - :array1(array1), array2(array2) +void decrement_data_pointers(std::vector& data_pointers, + size_t index, + size_t dim, + const std::vector& layouts) { + set_data_pointers(data_pointers, index, dim, Direction::Backward, layouts); +} + +class MultiArrayIterator final { +public: + explicit MultiArrayIterator(const std::vector& arrays) + :datas_(get_datas(arrays)), + layouts_(coalesce_dimensions(get_layouts(arrays))) {} +private: template - void loop(FN&& fn, T1 *data1, T2 *data2, size_t dim) { - const size_t index = dim - 1; - const auto& layout1 = array1.layout; - const auto& layout2 = array2.layout; - const ssize_t stride1 = layout1.strides()[index]; - const ssize_t stride2 = layout2.strides()[index]; - const size_t bound = layout1.shape()[index]; - - if (index == 0) { - for (size_t i = 0; i < bound; i++) { - if (fn(data1 + stride1 * i, data2 + stride2 * i)) { - break; + void exec(FN&& fn, const std::vector& layouts, std::vector datas, size_t n) { + const auto& layout = layouts.back(); + // Avoid function call for rank <= 2. + switch (n) { + case 0: { + break; + } + case 1: { + for (size_t i = 0; i < layout.shape()[0]; ++i) { + ::increment_data_pointers(datas, i, 0, layouts); + fn(datas); + ::decrement_data_pointers(datas, i, 0, layouts); + } + break; + } + case 2: { + for (size_t i = 0; i < layout.shape()[1]; ++i) { + ::increment_data_pointers(datas, i, 1, layouts); + for (size_t j = 0; j < layout.shape()[0]; ++j) { + ::increment_data_pointers(datas, j, 0, layouts); + fn(datas); + ::decrement_data_pointers(datas, j, 0, layouts); + } + ::decrement_data_pointers(datas, i, 1, layouts); + } + + break; + } + + default: { + const size_t bound = layouts.back().shape()[n - 1]; + for (size_t index = 0; index < bound; ++index) { + ::increment_data_pointers(datas, index, n - 1, layouts); + exec(std::forward(fn), layouts, datas, n - 1); + ::decrement_data_pointers(datas, index, n - 1, layouts); } } - return; - } - - for (size_t i = 0; i < bound; i++) { - loop(fn, data1 + stride1 * i, data2 + stride2 * i, dim - 1); } } +public: template - void loop(FN&& fn) { - loop(fn, array1.data, array2.data, array1.layout.rank()); + void exec(FN&& fn) { + std::vector datas = datas_; + exec(fn, layouts_, datas, layouts_[0].rank()); } - TypedMultiArray array1; - TypedMultiArray array2; +private: + std::vector datas_; + std::vector layouts_; }; +/// BNNS has no double type, so we handle the conversions here. 
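// Hedged sketch (illustrative only; buffer values assumed): the per-element cast that the
// copy_value<> templates below perform when neither the BNNS path nor memcpy applies,
// for example a Float64 source written into a Float16 destination.
#include <cassert>
#include <cstddef>

int main() {
    const double src[3] = {1.0, 2.5, -4.0};
    _Float16 dst[3];
    for (size_t i = 0; i < 3; ++i) {
        // Equivalent of copy_value<_Float16, double>(&dst[i], &src[i]).
        dst[i] = static_cast<_Float16>(src[i]);
    }
    assert(static_cast<double>(dst[1]) == 2.5);  // 2.5 is exactly representable in half precision
    return 0;
}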
template -struct MultiArrayLoopingCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - auto looper = MultiArrayIterator(src, dst); - looper.loop([](T1 *src, T2 *dst){ - *dst = static_cast(*src); - return true; - }); - - return true; - } -}; +inline void copy_value(void *dst, const void *src) { + const T2 *src_ptr = static_cast(src); + T1 *dst_ptr = static_cast(dst); + *dst_ptr = static_cast(*src_ptr); +} -template -struct MultiArrayCopier { - static bool copy(TypedMultiArray& src, TypedMultiArray& dst) { - if (src.layout.shape() != dst.layout.shape()) { - return false; +template +void copy(void *dst, + MultiArray::DataType dst_data_type, + const void *src) { + switch (dst_data_type) { + case MultiArray::DataType::Bool: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Byte: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Char: { + ::copy_value(dst, src); + break; + } + + case MultiArray::DataType::Short: { + ::copy_value(dst, src); + break; } - - if (src.layout.get_num_elements() == 0) { - return true; + + case MultiArray::DataType::Int32: { + ::copy_value(dst, src); + break; } - - if (MultiArrayBNNSCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Int64: { + ::copy_value(dst, src); + break; } - - if (MultiArrayVImageCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float16: { + ::copy_value<_Float16, T>(dst, src); + break; } - - if (MultiArrayVDSPCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float32: { + ::copy_value(dst, src); + break; } - - if (MultiArrayMemCopier::copy(src, dst)) { - return true; + + case MultiArray::DataType::Float64: { + ::copy_value(dst, src); + break; } - - return MultiArrayLoopingCopier::copy(src, dst); } -}; +} -template -bool copy(TypedMultiArray& src, MultiArray& dst) { - const auto& dstLayout = dst.layout(); - switch (dstLayout.dataType()) { - case MultiArray::DataType::Int: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); +void copy(void *dst, + MultiArray::DataType dst_data_type, + const void *src, + MultiArray::DataType src_data_type) { + switch (src_data_type) { + case MultiArray::DataType::Bool: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Byte: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Char: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Short: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Int32: { + ::copy(dst, dst_data_type, src); + break; + } + + case MultiArray::DataType::Int64: { + ::copy(dst, dst_data_type, src); + break; } case MultiArray::DataType::Float16: { - auto dst_array = TypedMultiArray<_Float16>(reinterpret_cast<_Float16 *>(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + ::copy<_Float16>(dst, dst_data_type, src); + break; } - case MultiArray::DataType::Float: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + case MultiArray::DataType::Float32: { + ::copy(dst, dst_data_type, src); + break; } - case MultiArray::DataType::Double: { - auto dst_array = TypedMultiArray(reinterpret_cast(dst.data()), dstLayout); - return MultiArrayCopier::copy(src, dst_array); + case MultiArray::DataType::Float64: { + ::copy(dst, dst_data_type, src); + break; } } } -} //namespace + +void copy(const 
MultiArray& src, MultiArray& dst, MultiArray::CopyOptions options) { + if (options.use_bnns && copy_using_bnns(src, dst)) { + return; + } + + if (options.use_memcpy && + src.layout().dataType() == dst.layout().dataType() && + src.layout().is_packed() && + dst.layout().is_packed()) { + std::memcpy(dst.data(), src.data(), src.layout().num_elements() * src.layout().num_bytes()); + return; + } + + auto iterator = MultiArrayIterator({src, dst}); + iterator.exec([&](const std::vector& datas){ + void *src_data = datas[0]; + void *dst_data = datas[1]; + ::copy(dst_data, dst.layout().dataType(), src_data, src.layout().dataType()); + }); +} + +ssize_t get_data_offset(const std::vector& indices, const std::vector& strides) { + ssize_t offset = 0; + for (size_t i = 0; i < indices.size(); ++i) { + offset += static_cast(indices[i]) * strides[i]; + } + + return offset; +} + +ssize_t get_data_offset(size_t index, const std::vector& shape, const std::vector& strides) { + size_t div = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies());; + size_t offset = 0; + for (size_t i = 0; i < shape.size(); ++i) { + div /= shape[i]; + size_t dim_index = index / div; + offset += dim_index * strides[i]; + index %= div; + } + + return offset; +} +} namespace executorchcoreml { -size_t MultiArray::MemoryLayout::get_num_elements() const noexcept { +size_t MultiArray::MemoryLayout::num_elements() const noexcept { if (shape_.size() == 0) { return 0; } @@ -553,32 +532,101 @@ bool copy(TypedMultiArray& src, MultiArray& dst) { return false; } expectedStride = expectedStride * (*shapeIt); + stridesIt++; } return true; } -bool MultiArray::copy(MultiArray& dst) const noexcept { - switch (layout().dataType()) { - case MultiArray::DataType::Int: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); +size_t MultiArray::MemoryLayout::num_bytes() const noexcept { + switch (dataType()) { + case MultiArray::DataType::Bool: { + return 1; + } + case MultiArray::DataType::Byte: { + return 1; + } + case MultiArray::DataType::Char: { + return 1; + } + case MultiArray::DataType::Short: { + return 2; + } + case MultiArray::DataType::Int32: { + return 4; + } + case MultiArray::DataType::Int64: { + return 8; } - case MultiArray::DataType::Float16: { - auto src = TypedMultiArray<_Float16>(reinterpret_cast<_Float16 *>(data()), layout()); - return ::copy(src, dst); + return 2; } - - case MultiArray::DataType::Float: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); + case MultiArray::DataType::Float32: { + return 4; } - - case MultiArray::DataType::Double: { - auto src = TypedMultiArray(reinterpret_cast(data()), layout()); - return ::copy(src, dst); + case MultiArray::DataType::Float64: { + return 8; + } + } +} + +void MultiArray::copy(MultiArray& dst, CopyOptions options) const noexcept { + assert(layout().shape() == dst.layout().shape()); + ::copy(*this, dst, options); +} + +std::optional to_ml_multiarray_data_type(MultiArray::DataType data_type) { + switch (data_type) { + case MultiArray::DataType::Float16: { + return MLMultiArrayDataTypeFloat16; + } + case MultiArray::DataType::Float32: { + return MLMultiArrayDataTypeFloat32; + } + case MultiArray::DataType::Float64: { + return MLMultiArrayDataTypeDouble; + } + case MultiArray::DataType::Int32: { + return MLMultiArrayDataTypeInt32; + } + default: { + return std::nullopt; + } + } +} + +std::optional to_multiarray_data_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case 
MLMultiArrayDataTypeFloat16: { + return MultiArray::DataType::Float16; + } + case MLMultiArrayDataTypeFloat32: { + return MultiArray::DataType::Float32; + } + case MLMultiArrayDataTypeFloat64: { + return MultiArray::DataType::Float64; + } + case MLMultiArrayDataTypeInt32: { + return MultiArray::DataType::Int32; + } + default: { + return std::nullopt; } } } + +void *MultiArray::data(const std::vector& indices) const noexcept { + assert(indices.size() == layout().shape().size()); + uint8_t *ptr = static_cast(data()); + ssize_t offset = ::get_data_offset(indices, layout().strides()); + return ptr + offset * layout().num_bytes(); +} + +void *MultiArray::data(size_t index) const noexcept { + assert(index < layout().num_elements()); + uint8_t *ptr = static_cast(data()); + ssize_t offset = ::get_data_offset(index, layout().shape(), layout().strides()); + return ptr + offset * layout().num_bytes(); +} + } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/kvstore/key_value_store.cpp b/backends/apple/coreml/runtime/kvstore/key_value_store.cpp index 70be312b967..4a7a491236b 100644 --- a/backends/apple/coreml/runtime/kvstore/key_value_store.cpp +++ b/backends/apple/coreml/runtime/kvstore/key_value_store.cpp @@ -53,8 +53,7 @@ get_create_store_statement(std::string_view store_name, StorageType key_storage_ std::string get_create_index_statement(std::string_view store_name, std::string_view column_name) { std::stringstream ss; - ss << "CREATE INDEX IF NOT EXISTS " << column_name << "_INDEX" - << " ON " << store_name << "(" << column_name << ")"; + ss << "CREATE INDEX IF NOT EXISTS " << column_name << "_INDEX" << " ON " << store_name << "(" << column_name << ")"; return ss.str(); } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h index 51204e34387..4048dae5fea 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h @@ -48,6 +48,9 @@ __attribute__((objc_subclassing_restricted)) /// The model. @property (readonly, strong, nonatomic) ETCoreMLModel* model; +/// If set to `YES` then output backing are ignored. +@property (readwrite, atomic) BOOL ignoreOutputBackings; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index e7f05662d28..57212445e55 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -170,6 +170,10 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError * __autoreleasing *)error { + if (self.ignoreOutputBackings) { + predictionOptions.outputBackings = @{}; + } + NSError *localError = nil; NSArray *outputs = nil; if (loggingOptions.log_profiling_info) { diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h index 5ab90c0ea19..7221086318e 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h @@ -15,7 +15,8 @@ typedef NSDictionary ETCoreMLModelOu NS_ASSUME_NONNULL_BEGIN /// A class responsible for debugging a model. 
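// Worked example (assumed shape and strides, not from the patch) of the offset math behind
// MultiArray::data(index) above: the linear index is peeled into per-dimension indices and
// folded against the strides, which also handles padded, non-packed layouts correctly.
// (data() then scales this element offset by the element byte size.)
#include <cassert>
#include <cstddef>
#include <sys/types.h>
#include <vector>

static ssize_t offset_for_index(size_t index,
                                const std::vector<size_t>& shape,
                                const std::vector<ssize_t>& strides) {
    size_t div = 1;
    for (size_t s : shape) {
        div *= s;
    }
    ssize_t offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        div /= shape[i];
        offset += static_cast<ssize_t>(index / div) * strides[i];
        index %= div;
    }
    return offset;
}

int main() {
    // Shape {2, 3} stored with a padded row stride of 4: element #4 (row 1, column 1)
    // lives at flat offset 1 * 4 + 1 * 1 = 5, not 4.
    assert(offset_for_index(4, {2, 3}, {4, 1}) == 5);
    // For a packed layout the offset equals the linear index.
    assert(offset_for_index(4, {2, 3}, {3, 1}) == 4);
    return 0;
}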
-__attribute__((objc_subclassing_restricted)) @interface ETCoreMLModelDebugger : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelDebugger : NSObject - (instancetype)init NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h index 0db5f24f2cb..a2fbb985820 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h @@ -21,7 +21,8 @@ typedef NSDictionary +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelStructurePath : NSObject - (instancetype)init NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h index 3b2211d8168..80c49f8965e 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h @@ -12,7 +12,8 @@ NS_ASSUME_NONNULL_BEGIN /// A class representing the profiling info of an operation. -__attribute__((objc_subclassing_restricted)) @interface ETCoreMLOperationProfilingInfo : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLOperationProfilingInfo : NSObject - (instancetype)init NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLPair.h b/backends/apple/coreml/runtime/sdk/ETCoreMLPair.h index be4c832da82..8ddd4191cd9 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLPair.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLPair.h @@ -9,7 +9,8 @@ NS_ASSUME_NONNULL_BEGIN /// A class representing a pair with first and second objects. -__attribute__((objc_subclassing_restricted)) @interface ETCoreMLPair : NSObject +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLPair : NSObject - (instancetype)init NS_UNAVAILABLE; diff --git a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm index c74cb564495..6f0e3cff31f 100644 --- a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm @@ -14,69 +14,32 @@ #import #import #import +#import using namespace executorchcoreml; namespace { -template -T toValue(NSNumber *value); -template<> -size_t toValue(NSNumber *value) { - return value.unsignedLongLongValue; -} - -template<> -ssize_t toValue(NSNumber *value) { - return value.longLongValue; -} - -template -std::vector toVector(NSArray *values) { - std::vector result; - result.reserve(values.count); - for (NSNumber *value in values) { - result.emplace_back(toValue(value)); - } - - return result; -} - -MultiArray::DataType toDataType(MLMultiArrayDataType dataType) { - switch (dataType) { - case MLMultiArrayDataTypeFloat: { - return MultiArray::DataType::Float; - } - case MLMultiArrayDataTypeFloat16: { - return MultiArray::DataType::Float16; - } - case MLMultiArrayDataTypeDouble: { - return MultiArray::DataType::Double; - } - case MLMultiArrayDataTypeInt32: { - return MultiArray::DataType::Int; - } - } -} - -MultiArray toMultiArray(MLMultiArray *mlMultiArray) { - auto shape = toVector(mlMultiArray.shape); - auto strides = toVector(mlMultiArray.strides); - auto layout = MultiArray::MemoryLayout(toDataType(mlMultiArray.dataType), std::move(shape), std::move(strides)); +MultiArray to_multiarray(MLMultiArray *ml_multiarray) { + auto shape = to_vector(ml_multiarray.shape); + auto strides = 
to_vector(ml_multiarray.strides); + auto layout = MultiArray::MemoryLayout(to_multiarray_data_type(ml_multiarray.dataType).value(), + std::move(shape), + std::move(strides)); __block void *bytes = nullptr; - [mlMultiArray getMutableBytesWithHandler:^(void *mutableBytes, __unused NSInteger size, __unused NSArray *strides) { + [ml_multiarray getMutableBytesWithHandler:^(void *mutableBytes, __unused NSInteger size, __unused NSArray *strides) { bytes = mutableBytes; }]; return MultiArray(bytes, std::move(layout)); } -std::vector toMultiArrays(NSArray *mlMultiArrays) { +std::vector to_multiarrays(NSArray *ml_multiarrays) { std::vector result; - result.reserve(mlMultiArrays.count); + result.reserve(ml_multiarrays.count); - for (MLMultiArray *mlMultiArray in mlMultiArrays) { - result.emplace_back(toMultiArray(mlMultiArray)); + for (MLMultiArray *ml_multiarray in ml_multiarrays) { + result.emplace_back(to_multiarray(ml_multiarray)); } return result; } @@ -198,7 +161,7 @@ - (void)testAddModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; XCTAssertTrue(_delegate->execute(handle, - toMultiArrays(args), + to_multiarrays(args), ModelLoggingOptions(), nullptr, errorCode)); @@ -223,7 +186,7 @@ - (void)testMulModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; XCTAssertTrue(_delegate->execute(handle, - toMultiArrays(args), + to_multiarrays(args), ModelLoggingOptions(), nullptr, errorCode)); diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index 13f8343adf2..94b862d8424 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -15,7 +15,7 @@ #import #import -static constexpr size_t kRuntimeMemorySize = 10 * 1024U * 1024U; // 10 MB +static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; using torch::executor::testing::TensorFactory; @@ -104,7 +104,7 @@ ET_LOG(Info, "Skipping non-tensor input %zu", i); continue; } - Buffer buffer(tensor_meta->nbytes(), 1); + Buffer buffer(tensor_meta->nbytes(), 0); auto sizes = tensor_meta->sizes(); exec_aten::TensorImpl tensor_impl(tensor_meta->scalar_type(), std::size(sizes), const_cast(sizes.data()), buffer.data()); exec_aten::Tensor tensor(&tensor_impl); @@ -155,8 +155,8 @@ - (void)testProgramLoad { XCTAssert(method.ok()); } -- (void)executeModelAtURL:(NSURL *)modelURL nTimes:(NSUInteger)nTimes { - for (NSUInteger i = 0; i < nTimes; i++) { +- (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecutions:(NSUInteger)nExecutions { + for (NSUInteger i = 0; i < nLoads; ++i) { auto loader = std::make_unique(modelURL.path.UTF8String); auto program = get_program(loader.get()); XCTAssert(program != nullptr); @@ -165,41 +165,44 @@ - (void)executeModelAtURL:(NSURL *)modelURL nTimes:(NSUInteger)nTimes { auto plannedBuffers = get_planned_buffers(methodName.get(), program.get()); XCTAssert(plannedBuffers.ok()); Buffer methodBuffer(kRuntimeMemorySize, 0); - MemoryAllocator methodAllocator(static_cast(methodBuffer.size()), methodBuffer.data()); + __block MemoryAllocator methodAllocator(static_cast(methodBuffer.size()), methodBuffer.data()); auto spans = to_spans(plannedBuffers.get()); HierarchicalAllocator plannedAllocator({spans.data(), spans.size()}); MemoryManager memoryManger(&methodAllocator, &plannedAllocator); - auto method = 
program->load_method(methodName.get().c_str(), &memoryManger); + __block auto method = program->load_method(methodName.get().c_str(), &memoryManger); XCTAssert(method.ok()); auto inputs = ::prepare_input_tensors(method.get()); - auto status = method->execute(); - XCTAssertEqual(status, Error::Ok); auto outputs = methodAllocator.allocateList(method->outputs_size()); - status = method->get_outputs(outputs, method->outputs_size()); - XCTAssertEqual(status, Error::Ok); + for (NSUInteger j = 0; j < nExecutions; ++j) { + auto status = method->execute(); + XCTAssertEqual(status, Error::Ok); + status = method->get_outputs(outputs, method->outputs_size()); + XCTAssertEqual(status, Error::Ok); + } } } - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nTimes:10]; + [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; } - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs - nTimes:(NSUInteger)nTimes + nLoads:(NSUInteger)nLoads + nExecutions:(NSUInteger)nExecutions timeout:(NSTimeInterval)timeout { NSMutableArray *expectations = [NSMutableArray arrayWithCapacity:modelURLs.count]; dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); @@ -208,7 +211,7 @@ - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs XCTestExpectation *expectation = [[XCTestExpectation alloc] initWithDescription:description]; [expectations addObject:expectation]; dispatch_async(queue, ^{ - [self executeModelAtURL:modelURL nTimes:nTimes]; + [self executeModelAtURL:modelURL nLoads:nLoads nExecutions:nExecutions]; [expectation fulfill]; }); } @@ -221,7 +224,8 @@ - (void)testMultipleModelExecutionConcurrently { NSURL *modelURL2 = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; NSURL *modelURL3 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; [self executeMultipleModelsConcurrently:@[modelURL1, modelURL2, modelURL3] - nTimes:10 + nLoads:5 + nExecutions:2 timeout:5 * 60]; } @@ -229,7 +233,8 @@ - (void)testSameModelExecutionConcurrently { NSURL *modelURL1 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; NSURL *modelURL2 = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; [self executeMultipleModelsConcurrently:@[modelURL1, modelURL2] - nTimes:10 + nLoads:5 + nExecutions:2 timeout:5 * 60]; } diff --git a/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm b/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm index d20d292cf69..8ad712497ea 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm @@ -115,7 +115,7 @@ - (void)testAddModelExecution { NSArray *args = [inputs arrayByAddingObject:output]; XCTAssertTrue([self.modelManager executeModelWithHandle:handle args:args - 
loggingOptions:executorchcoreml::ModelLoggingOptions() + loggingOptions:executorchcoreml::ModelLoggingOptions() eventLogger:nullptr error:&localError]); for (NSUInteger i = 0; i < output.count; i++) { diff --git a/backends/apple/coreml/runtime/test/MultiArrayTests.mm b/backends/apple/coreml/runtime/test/MultiArrayTests.mm new file mode 100644 index 00000000000..895702ae154 --- /dev/null +++ b/backends/apple/coreml/runtime/test/MultiArrayTests.mm @@ -0,0 +1,133 @@ +// +// MultiArrayTests.mm +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. + +#import +#import +#import + +#import + +using namespace executorchcoreml; + +namespace { +size_t get_buffer_size(const std::vector& shape, const std::vector& srides) { + auto max_stride_it = std::max_element(srides.begin(), srides.end()); + size_t max_stride_axis = static_cast(std::distance(srides.begin(), max_stride_it)); + size_t dimension_with_max_stride = shape[max_stride_axis]; + return dimension_with_max_stride * (*max_stride_it); +} + +template +MultiArray::DataType get_multiarray_data_type(); + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Float32; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Float64; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Int64; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Int32; +} + +template<> MultiArray::DataType get_multiarray_data_type() { + return MultiArray::DataType::Short; +} + +template<> MultiArray::DataType get_multiarray_data_type<_Float16>() { + return MultiArray::DataType::Float16; +} + +template +void verify_values(const MultiArray& multiarray1, const MultiArray& multiarray2) { + for (size_t i = 0; i < multiarray1.layout().num_elements(); ++i) { + XCTAssertEqual(multiarray1.value(i), multiarray2.value(i)); + } +} + +template +MultiArray make_multi_array(const std::vector& shape, const std::vector& strides, std::vector& storage) { + storage.resize(get_buffer_size(shape, strides) * sizeof(T), 0); + MultiArray::MemoryLayout layout(get_multiarray_data_type(), shape, strides); + return MultiArray(storage.data(), std::move(layout)); +} + +template +MultiArray make_multi_array_and_fill(const std::vector& shape, const std::vector& strides, std::vector& storage) { + auto result = make_multi_array(shape, strides, storage); + for (size_t i = 0; i < result.layout().num_elements(); ++i) { + T value = static_cast(i); + result.set_value(i, value); + } + + return result; +} + +template +void verify_copy_(const std::vector& shape, + const std::vector& src_strides, + const std::vector& dst_strides) { + std::vector src_storage; + auto src_multiarray = make_multi_array_and_fill(shape, src_strides, src_storage); + + std::vector dst_storage; + auto dst_multiarray = make_multi_array(shape, dst_strides, dst_storage); + src_multiarray.copy(dst_multiarray, MultiArray::CopyOptions(true, false)); + verify_values(src_multiarray, dst_multiarray); + + dst_storage.clear(); + dst_storage.resize(get_buffer_size(shape, dst_strides) * sizeof(T2), 0); + src_multiarray.copy(dst_multiarray, MultiArray::CopyOptions(false, false)); + verify_values(src_multiarray, dst_multiarray); +} + +template +void verify_copy(const std::vector& shape, + const std::vector& src_strides, + const std::vector& dst_strides) { + verify_copy_(shape, 
src_strides, dst_strides); + verify_copy_(shape, src_strides, dst_strides); +} +} //namespace + +@interface MultiArrayTests : XCTestCase + +@end + +@implementation MultiArrayTests + +- (void)verifyDataCopyWithShape:(const std::vector&)shape + srcStrides:(const std::vector&)srcStrides + dstStrides:(const std::vector&)dstStrides { + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, srcStrides); + verify_copy(shape, srcStrides, dstStrides); + verify_copy(shape, srcStrides, srcStrides); +} + +- (void)testAdjacentDataCopy { + std::vector shape = {1, 3, 10, 10}; + std::vector strides = {3 * 10 * 10, 10 * 10, 10, 1}; + [self verifyDataCopyWithShape:shape srcStrides:strides dstStrides:strides]; +} + +- (void)testNonAdjacentDataCopy { + std::vector shape = {1, 3, 10, 10}; + std::vector srcStrides = {3 * 10 * 64, 10 * 64, 64, 1}; + std::vector dstStrides = {3 * 10 * 10 * 10, 10 * 10 * 10, 100, 10}; + [self verifyDataCopyWithShape:shape srcStrides:srcStrides dstStrides:dstStrides]; +} + +@end diff --git a/backends/apple/coreml/runtime/util/objc_array_util.h b/backends/apple/coreml/runtime/util/objc_array_util.h new file mode 100644 index 00000000000..5f4c8c7bc26 --- /dev/null +++ b/backends/apple/coreml/runtime/util/objc_array_util.h @@ -0,0 +1,42 @@ +// +// objc_array_util.h +// util +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. + +#import +#import +#import + +namespace executorchcoreml { + +template T to_value(NSNumber* value); + +template <> inline size_t to_value(NSNumber* value) { return value.unsignedLongValue; } + +template <> inline ssize_t to_value(NSNumber* value) { return value.longLongValue; } + +template ::value, T>::type> +inline NSArray* to_array(const std::vector& array) { + NSMutableArray* result = [NSMutableArray arrayWithCapacity:array.size()]; + for (T value: array) { + [result addObject:@(value)]; + } + + return result; +} + +template ::value, T>::type> +inline std::vector to_vector(NSArray* numbers) { + std::vector result; + result.reserve(numbers.count); + for (NSNumber* number in numbers) { + result.emplace_back(to_value(number)); + } + + return result; +} + +} diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index 4c9fc081b9e..d8ee4ea693a 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -100,6 +100,8 @@ C9E7D7952AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78D2AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm */; }; C9E7D7962AB3F9BF00CCAE5D /* KeyValueStoreTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78E2AB3F9BF00CCAE5D /* KeyValueStoreTests.mm */; }; C9E7D7A22AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D7A12AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm */; }; + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; + C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9EC7E1A2BC73B3200A6B166 /* 
MultiArrayTests.mm */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -297,6 +299,9 @@ C9EA3DB22B71A2B200B7D7BD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C9EA3FDE2B73EEA000B7D7BD /* libsqlite3.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.tbd; path = usr/lib/libsqlite3.tbd; sourceTree = SDKROOT; }; C9EA3FE52B73EF6300B7D7BD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; + C9EC7E092BC662A300A6B166 /* objc_array_util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = objc_array_util.h; path = ../util/objc_array_util.h; sourceTree = ""; }; + C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = MultiArrayTests.mm; path = ../test/MultiArrayTests.mm; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -305,6 +310,7 @@ buildActionMask = 2147483647; files = ( C94D510F2ABDF87500AF47FD /* Accelerate.framework in Frameworks */, + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, C94D510E2ABDF86800AF47FD /* libsqlite3.tbd in Frameworks */, C94D50D92ABD7B2400AF47FD /* CoreML.framework in Frameworks */, C99883862B95AD7D000953A3 /* libprotobuf-lite.a in Frameworks */, @@ -523,6 +529,7 @@ C96560942AABFDCE005F8126 /* libsqlite3.tbd */, C96560922AABF992005F8126 /* CoreML.framework */, C96560902AABF982005F8126 /* Accelerate.framework */, + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */, C965608D2AABF72A005F8126 /* libexecutorch.a */, ); name = "Recovered References"; @@ -536,6 +543,7 @@ C97716DB2AF44D9A00FC0DAC /* objc_json_serde.h */, C97716DC2AF44E7B00FC0DAC /* objc_json_serde.mm */, C97716DE2AF44FC400FC0DAC /* objc_safe_cast.h */, + C9EC7E092BC662A300A6B166 /* objc_array_util.h */, ); name = util; sourceTree = ""; @@ -574,6 +582,7 @@ C998838C2B96841D000953A3 /* ETCoreMLModelStructurePathTests.mm */, C998838E2B96999F000953A3 /* ETCoreMLModelProfilerTests.mm */, C962271A2B984FB9002D13B7 /* ETCoreMLModelDebuggerTests.mm */, + C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */, ); name = test; sourceTree = ""; @@ -724,6 +733,7 @@ C945E9372B997EEE009C3FAC /* FeatureTypes.pb.cc in Sources */, C945E9402B997EEE009C3FAC /* OneHotEncoder.pb.cc in Sources */, C94D50E82ABDF81100AF47FD /* key_value_store.cpp in Sources */, + C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */, C945E9452B997EEE009C3FAC /* BayesianProbitRegressor.pb.cc in Sources */, C945E8E52B997ECE009C3FAC /* ETCoreMLOperationProfilingInfo.mm in Sources */, C945E9312B997EEE009C3FAC /* DataStructures.pb.cc in Sources */, diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh index 72afca2d6ce..730ba0839db 100755 --- a/backends/apple/coreml/scripts/build_tests.sh +++ b/backends/apple/coreml/scripts/build_tests.sh @@ -59,6 +59,7 @@ cmake --build "$CMAKE_PROTOBUF_BUILD_DIR_PATH" -j9 -t libprotobuf-lite 
echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch.a" "$LIBRARIES_DIR_PATH" +cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch_no_prim_ops.a" "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_PROTOBUF_BUILD_DIR_PATH/libprotobuf-lite.a" "$LIBRARIES_DIR_PATH" #Copy ExecuTorch headers diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0f703c9e430..b48ac7bfb69 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 7.2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index c01f6e2d238..4e66544f7bb 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -29,8 +29,8 @@ python3 -m examples.apple.coreml.scripts.export --model_name add 4. You can now integrate the **Core ML** backend in code. ```python -# Lower to Core ML backend -lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule, []) +# Delegate to Core ML backend +delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) ``` @@ -46,15 +46,15 @@ lowered_module = to_backend('CoreMLBackend', to_be_lowered_exir_submodule, []) xcode-select --install ``` -2. Build **Core ML** delegate. The following will create a `executorch.xcframework` in `cmake-out` directory. +4. Build **Core ML** delegate. The following will create `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch ./build/build_apple_frameworks.sh --Release --coreml ``` -3. Open the project in Xcode, and drag the `executorch.xcframework` generated from Step 2 to Frameworks. +5. Open the project in Xcode, and drag `executorch.xcframework` and `coreml_backend.xcframework` frameworks generated from Step 2 to Frameworks. -4. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks: +6. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks: ``` executorch.xcframework @@ -63,9 +63,9 @@ coreml_backend.xcframework 5. Go to project Target’s Build Phases - Link Binaries With Libraries, click the + sign, and add the following frameworks. ``` -- Accelerate.framework -- CoreML.framework -- libsqlite3.tbd +Accelerate.framework +CoreML.framework +libsqlite3.tbd ``` 6. The target could now run a **Core ML** delegated **Program**. 
diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index ef64e26f2cc..a3b0bdab670 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -70,13 +70,16 @@ target_link_libraries(mpsdelegate PRIVATE bundled_program mps_schema - ${_executor_runner_libs} + executorch_no_prim_ops ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} ${MPS_GRAPG_FRAMEWORK} ) +target_link_options_shared_lib(mpsdelegate) +target_compile_options(mpsdelegate PUBLIC ${_common_compile_options}) + install( TARGETS mpsdelegate DESTINATION lib diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 0e543d7e079..bb828ed0f90 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -18,6 +18,7 @@ from executorch.backends.apple.mps.serialization.mps_graph_schema import ( MPSGraph, MPSTensor, + OpType, ) from executorch.backends.apple.mps.serialization.mps_graph_serialize import ( @@ -65,6 +66,7 @@ def preprocess( input_ids=[], output_ids=[], constant_ids=[], + graph_type=OpType.mps_graph, ) convert_model_to_fp16 = True @@ -111,6 +113,16 @@ def handle_call_function( mps_graph: MPSGraph, ) -> None: logging.info(f"Visiting: {node}, {node.target.__name__}") + + if ( + "delegation_tag" in node.meta + and "metal_kernel" in node.meta["delegation_tag"] + ): + logging.info( + f"Node '{node.target.__name__}' was marked as a Metal kernel by the MPSPartitioner!" + ) + mps_graph.graph_type = OpType.metal_kernel + if node.target.__name__ in node_visitors: node_visitors[node.target.__name__].define_node(node, mps_graph) else: diff --git a/backends/apple/mps/operators/indexing_ops.py b/backends/apple/mps/operators/indexing_ops.py index f2c9dc6aeab..690549973a4 100644 --- a/backends/apple/mps/operators/indexing_ops.py +++ b/backends/apple/mps/operators/indexing_ops.py @@ -3,7 +3,7 @@ # Provided subject to the LICENSE file in the top level directory. # -from typing import cast +from typing import cast, List import torch from executorch.backends.apple.mps.operators.node_visitor import ( @@ -13,9 +13,12 @@ from executorch.backends.apple.mps.serialization.mps_graph_schema import ( MPSEmbedding, MPSGraph, + MPSIndexPut, MPSIndexSelect, + MPSIndexTensor, ) from executorch.backends.apple.mps.utils.mps_utils import get_input_node +from executorch.backends.transforms import get_shape from executorch.exir.sym_util import eval_expr @@ -40,6 +43,78 @@ def define_node( mps_graph.mps_nodes.append(mps_node) +@register_node_visitor +class IndexTensorVisitor(NodeVisitor): + target = "aten.index.Tensor" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + mps_node = self.create_unary_node(node, mps_graph, MPSIndexTensor) + tensors = cast(List[torch.fx.Node], node.args[1]) + for tensor in tensors: + mps_node.mpsnode_union.indices_id.append( + self.define_tensor(tensor, mps_graph) + ) + + mps_graph.mps_nodes.append(mps_node) + + +# [MPS TODO]: Works on a single iteration of llama2, but subsequent tokens +# are wrong when using Index put. Disabling it for now. 
+@register_node_visitor +class IndexPutVisitor(NodeVisitor): + # target = "aten.index_put.default" + target = "disabled" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def infer_sizes(self, a: List[int], b: List[int]): + dimsA = len(a) + dimsB = len(b) + ndim = dimsA if dimsA > dimsB else dimsB + expandedSizes = [0] * ndim + for i in range(ndim - 1, -1, -1): + offset = ndim - 1 - i + dimA = dimsA - 1 - offset + dimB = dimsB - 1 - offset + sizeA = a[dimA] if dimA >= 0 else -1 + sizeB = b[dimB] if dimB >= 0 else -1 + expandedSizes[i] = sizeA if sizeB == -1 else sizeB + + return expandedSizes + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + mps_node = self.create_unary_node(node, mps_graph, MPSIndexPut) + updates_shape = get_shape(node.args[2]) + input_shape = get_shape(node.args[0]) + new_shape = [] + if len(updates_shape) != 1 and len(updates_shape) != len(input_shape): + new_shape = self.infer_sizes(input_shape, updates_shape) + mps_node.mpsnode_union.values_shape = new_shape + + tensors = cast(List[torch.fx.Node], node.args[1]) + for tensor in tensors: + mps_node.mpsnode_union.indices_id.append( + self.define_tensor(tensor, mps_graph) + ) + + mps_node.mpsnode_union.values_id = self.define_tensor( + get_input_node(node, 2), mps_graph + ) + mps_graph.mps_nodes.append(mps_node) + + @register_node_visitor class EmbeddingVisitor(NodeVisitor): target = "aten.embedding.default" diff --git a/backends/apple/mps/operators/unary_ops.py b/backends/apple/mps/operators/unary_ops.py index 411924d0406..8b67d7dfba2 100644 --- a/backends/apple/mps/operators/unary_ops.py +++ b/backends/apple/mps/operators/unary_ops.py @@ -30,6 +30,7 @@ MPSLog, MPSLog10, MPSLog2, + MPSLogicalNot, MPSNeg, MPSReciprocal, MPSRound, @@ -79,6 +80,7 @@ class UnaryOpVisitor(NodeVisitor): "aten.isnan.default", "aten.isinf.default", "aten.round.default", + "aten.logical_not.default", ] def __init__(self, *args) -> None: @@ -115,6 +117,7 @@ def __init__(self, *args) -> None: exir_ops.edge.aten.isnan.default: MPSIsnan, exir_ops.edge.aten.isinf.default: MPSIsinf, exir_ops.edge.aten.round.default: MPSRound, + exir_ops.edge.aten.logical_not.default: MPSLogicalNot, } def define_node( diff --git a/backends/apple/mps/partition/mps_partitioner.py b/backends/apple/mps/partition/mps_partitioner.py index a06677a59a5..e5497389d14 100644 --- a/backends/apple/mps/partition/mps_partitioner.py +++ b/backends/apple/mps/partition/mps_partitioner.py @@ -4,12 +4,13 @@ # import logging -from typing import Any, Dict, List, Union +from typing import Any, cast, Dict, List, Union import torch from executorch.backends.apple.mps.mps_preprocess import MPSBackend from executorch.backends.apple.mps.operators.node_visitor import get_node_visitors from executorch.backends.apple.mps.utils.mps_utils import is_parameter +from executorch.backends.transforms import get_shape from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( generate_partitions_from_list_of_nodes, @@ -20,6 +21,7 @@ PartitionResult, ) from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import Partition from torch.fx.passes.operator_support import OperatorSupportBase @@ -28,6 +30,13 @@ logging.basicConfig(level=logging.DEBUG, format=FORMAT) +# ops implemented as Metal kernels. 
+METAL_KERNELS = [ + exir_ops.edge.aten.index.Tensor, + exir_ops.edge.aten.index_put.default, +] + + class MPSOperatorSupport(OperatorSupportBase): def __init__(self, edge_program: torch.export.ExportedProgram, compiler_specs): self.node_visitors = get_node_visitors(edge_program) @@ -65,10 +74,47 @@ def generate_partitions(self, edge_program: ExportedProgram) -> List[Any]: op_support=self.supported_ops, ) + def mps_graph_advanced_indexing_support(self, node: torch.fx.Node): + num_indices = 0 + tensors = cast(List[torch.fx.Node], node.args[1]) + input = cast(torch.fx.Node, node.args[0]) + for t in tensors: + if t is not None: + num_indices += 1 + # Can dispatch to MPSGraph if the length of the slices is equal + # to the number of dimensions of the sliced tensors, or only one + # slice is present. All other cases will fallback to a Metal kernel. + if num_indices == len(get_shape(input)) or num_indices == 1: + return True + + return False + + def use_metal_kernel(self, node: torch.fx.Node): + if node.target in METAL_KERNELS: + if ( + node.target == exir_ops.edge.aten.index.Tensor + or node.target == exir_ops.edge.aten.index_put.default + ): + if not self.mps_graph_advanced_indexing_support(node): + return True + return False + def tag_nodes(self, partitions: List[Partition]) -> None: for partition in partitions: + crt_partition_counter = 0 for node in partition.nodes: delegation_tag = f"mps_{partition.id}" + if self.use_metal_kernel(node): + logging.warning(f"[WARNING] Using Metal kernel for op {node.name}!") + # Partition the Metal kernel into a separate partition + crt_partition_counter += 1 + delegation_tag = ( + f"{delegation_tag}_metal_kernel_{crt_partition_counter}" + ) + crt_partition_counter += 1 + else: + delegation_tag = f"{delegation_tag}_{crt_partition_counter}" + node.meta["delegation_tag"] = delegation_tag self.partition_tags[delegation_tag] = self.delegation_spec diff --git a/backends/apple/mps/runtime/MPSDevice.h b/backends/apple/mps/runtime/MPSDevice.h index d9ab403e80b..a8b5dbe2b81 100644 --- a/backends/apple/mps/runtime/MPSDevice.h +++ b/backends/apple/mps/runtime/MPSDevice.h @@ -5,10 +5,19 @@ #pragma once +// Obj-C headers #include #include + +// Runtime headers +#include + +// MPS headers #include +#include +#include + #define MB(x) (x * 1048576UL) namespace torch { @@ -25,6 +34,11 @@ enum class MacOSVersion : uint32_t { MACOS_VER_14_0_PLUS, }; +enum class LibraryType : uint32_t { + INDEXING_KERNELS = 0, + MAX = INDEXING_KERNELS, +}; + class MPSDevice { public: /** @@ -53,9 +67,18 @@ class MPSDevice { ~MPSDevice(); + /** + * Compile a PSO for a given library type. + * Once compiled, the library and PSOs are cached. 
+ */ +  Error compilePSO(LibraryType libraryType, const char* kernelName); +  Error compileLibrary(LibraryType); + private: static MPSDevice* _device; id _mtl_device; + std::unordered_map> _m_library_cache; + std::unordered_map> _m_pso_cache; MPSDevice(); }; diff --git a/backends/apple/mps/runtime/MPSDevice.mm b/backends/apple/mps/runtime/MPSDevice.mm index 86518fd0025..f51851c3795 100644 --- a/backends/apple/mps/runtime/MPSDevice.mm +++ b/backends/apple/mps/runtime/MPSDevice.mm @@ -16,6 +16,20 @@ static std::unique_ptr mps_device; static std::once_flag mpsdev_init; +static inline MTLLanguageVersion getMetalLanguageVersion(const id& device, bool macOS13Plus) { + // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants) + // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+) + MTLLanguageVersion languageVersion = MTLLanguageVersion2_3; +#if defined(__MAC_13_0) + if (macOS13Plus) { + languageVersion = MTLLanguageVersion3_0; + } +#endif + + ET_CHECK_MSG([device supportsFamily:MTLGPUFamilyMac2], "Missing Metal support for MTLGPUFamilyMac2"); + return languageVersion; +} + MPSDevice::~MPSDevice() { [_mtl_device release]; _mtl_device = nil; @@ -79,6 +93,57 @@ } } +const char* getLibraryCString(LibraryType libraryType) { + switch (libraryType) { + case LibraryType::INDEXING_KERNELS: + return "TODO"; + default: + ET_CHECK_MSG(false, "Unhandled library type!"); + } +} + +Error +MPSDevice::compileLibrary(LibraryType libraryType) { + Error err = Error::Ok; + NSError* error = nil; + MTLCompileOptions* options = [MTLCompileOptions new]; + [options setLanguageVersion:getMetalLanguageVersion(_mtl_device, isMacOS13Plus(MacOSVersion::MACOS_VER_13_0_PLUS))]; + [options setFastMathEnabled:YES]; + id lib = + [_mtl_device newLibraryWithSource:[NSString stringWithCString:getLibraryCString(libraryType) + encoding:NSASCIIStringEncoding] + options:options + error:&error]; + + ET_CHECK_OR_RETURN_ERROR( + lib != nil, + Internal, + "Failed to create indexing library, error: %s", [[error description] UTF8String] + ); + + _m_library_cache[libraryType] = lib; + return err; +} + +Error +MPSDevice::compilePSO(LibraryType libraryType, const char* kernelName) { + Error err = Error::Ok; + if (_m_library_cache.find(libraryType) == _m_library_cache.end()) { + ET_LOG(Debug, "Compiling library type: %d", libraryType); + err = compileLibrary(libraryType); + ET_CHECK_OR_RETURN_ERROR( + err == Error::Ok, + Internal, + "An error occurred while compiling library %d", libraryType + ); + } + if (_m_pso_cache.find(kernelName) == _m_pso_cache.end()) { + ET_LOG(Debug, "Compiling kernel: %s", kernelName); + // err = compilePSO(libraryType, kernelName); + } + return err; +} + bool isMacOS13OrNewer(MacOSVersion version) { return MPSDevice::getInstance()->isMacOS13Plus(version); } diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.h b/backends/apple/mps/runtime/MPSGraphBuilder.h index 0a7bf835a73..e4e89d68691 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.h +++ b/backends/apple/mps/runtime/MPSGraphBuilder.h @@ -109,6 +109,7 @@ class MPSGraphBuilder { _DEFINE_MPS_OP(Isnan); _DEFINE_MPS_OP(Isinf); _DEFINE_MPS_OP(Round); + _DEFINE_MPS_OP(LogicalNot); _DEFINE_MPS_OP(NormCdf); // Clamp ops _DEFINE_MPS_OP(Clamp); @@ -120,6 +121,8 @@ class MPSGraphBuilder { // Indexing ops _DEFINE_MPS_OP(IndexSelect); _DEFINE_MPS_OP(Embedding); + _DEFINE_MPS_OP(IndexTensor); + _DEFINE_MPS_OP(IndexPut); // Linear algebra ops _DEFINE_MPS_OP(MatMul);
_DEFINE_MPS_OP(Addmm); @@ -153,6 +156,7 @@ class MPSGraphBuilder { // Helper functions Error addNodeToMPSGraph(NodePtr nodePtr); + Error compileMetalKernel(NodePtr nodePtr); MPSShape *getMPSShape(int32_t id); MPSShape *getMPSShape(const flatbuffers::Vector *shape); int64_t numel(const flatbuffers::Vector *shape); @@ -161,6 +165,8 @@ class MPSGraphBuilder { MPSGraphTensor *getMPSGraphTensor(int32_t id); NSData *getConstantData(int32_t id); std::pair getMinMaxValues(NodePtr nodePtr); + Error compileMPSGraph(); + Error compileMetalKernel(); // Each MPSGraph op result in at least MPSGraphTensor being // produced, which will be stored in this structure. Other ops @@ -172,6 +178,7 @@ class MPSGraphBuilder { // FlatBuffer raw bytes of the serialized MPS model. const void *_buffer_pointer; + bool _metal_kernel; MPSGraph *_mpsGraph; MPSGraphExecutable *_mpsGraphExecutable; NSMutableDictionary *_feeds; diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.mm b/backends/apple/mps/runtime/MPSGraphBuilder.mm index d82b677066f..8b571001d42 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.mm +++ b/backends/apple/mps/runtime/MPSGraphBuilder.mm @@ -17,6 +17,7 @@ _targetTensors = [NSMutableArray new]; _mpsGraphExecutable = nil; + _metal_kernel = false; } Error @@ -32,8 +33,34 @@ mpsgraph::MPSGraphIdentifier()); _flatBufferGraph = mpsgraph::GetMPSGraph(_buffer_pointer); - _idToMPSGraphTensor.resize(_flatBufferGraph->mps_values()->size(), nullptr); + switch (_flatBufferGraph->graph_type()) { + case mpsgraph::OpType::metal_kernel: + { + _metal_kernel = true; + err = compileMetalKernel(); + break; + } + case mpsgraph::OpType::mps_graph: + { + err = compileMPSGraph(); + break; + } + default: + ET_CHECK_OR_RETURN_ERROR( + false, + DelegateInvalidCompatibility, + "Received an invalid operation type: expected MPSGraph or metal kernel, but got: %s", + EnumNameOpType(_flatBufferGraph->graph_type())); + } + + return err; +} +Error +MPSGraphBuilder::compileMPSGraph() { + Error err = Error::Ok; + + _idToMPSGraphTensor.resize(_flatBufferGraph->mps_values()->size(), nullptr); // Add the placeholder nodes to the graph. 
for (auto in_id : *_flatBufferGraph->input_ids()) { err = mpsGraphRankedPlaceholder(in_id); @@ -71,6 +98,30 @@ return err; } +Error +MPSGraphBuilder::compileMetalKernel() { + Error err = Error::Ok; + + ET_CHECK_OR_RETURN_ERROR( + _flatBufferGraph->mps_nodes()->size() == 1, + DelegateInvalidCompatibility, + "Currently supporting dispatching a single Metal kernel."); + ET_CHECK_OR_RETURN_ERROR( + _flatBufferGraph->constant_ids()->size() == 0, + DelegateInvalidCompatibility, + "Currently not supporting dispatching Metal kernels with constants."); + + // Compile the corresponding Metal kernel + for (auto node : *_flatBufferGraph->mps_nodes()) { + err = compileMetalKernel(node); + if (err != Error::Ok) { + return err; + } + } + + return err; +} + Error MPSGraphBuilder::mpsGraphRankedPlaceholder(int32_t id) { ET_LOG(Debug, "%s: %d", __FUNCTION__, id); diff --git a/backends/apple/mps/runtime/operations/IndexingOps.mm b/backends/apple/mps/runtime/operations/IndexingOps.mm index 1c02cbea5c4..b4dcf192b46 100644 --- a/backends/apple/mps/runtime/operations/IndexingOps.mm +++ b/backends/apple/mps/runtime/operations/IndexingOps.mm @@ -108,6 +108,102 @@ return Error::Ok; } +Error +MPSGraphBuilder::mpsIndexTensorOp(NodePtr nodePtr) { + Error err = Error::Ok; + auto graphNode = nodePtr->mpsnode_union_as_MPSIndexTensor(); + ET_LOG( + Debug, "%s: %d -> %d", + __FUNCTION__, graphNode->input1_id(), graphNode->output_id() + ); + + if (_metal_kernel) { + err = MPSDevice::getInstance()->compilePSO(LibraryType::INDEXING_KERNELS, "index_select"); + ET_CHECK_MSG(false, "Metal kernel path not yet implemented\n"); + } else { + int validIndices = 0; + int numIndices = graphNode->indices_id()->size(); + int axis = -1; + int indexId = -1; + for (int i = 0; i < numIndices; i++) { + int32_t index_id = graphNode->indices_id()->Get(i); + if (index_id == -1) { + continue; + } + validIndices++; + axis = i; + indexId = index_id; + } + ET_LOG(Debug, "index.Tensor with %d indices (axis = %d)", validIndices, axis); + ET_CHECK(validIndices > 0); + + if (validIndices == 1) { + MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* indexTensor = getMPSGraphTensor(indexId); + _idToMPSGraphTensor[graphNode->output_id()] = + [_mpsGraph gatherWithUpdatesTensor:updatesTensor indicesTensor:indexTensor axis:axis batchDimensions:0 name:nil]; + } else { + ET_CHECK_MSG(false, "Not yet implemented"); + } + } + + return err; +} + +Error +MPSGraphBuilder::mpsIndexPutOp(NodePtr nodePtr) { + Error err = Error::Ok; + auto graphNode = nodePtr->mpsnode_union_as_MPSIndexPut(); + ET_LOG( + Debug, "%s: %d -> %d", + __FUNCTION__, graphNode->input1_id(), graphNode->output_id() + ); + + if (_metal_kernel) { + err = MPSDevice::getInstance()->compilePSO(LibraryType::INDEXING_KERNELS, "index_put"); + ET_CHECK_MSG(false, "Metal kernel path not yet implemented\n"); + } else { + int validIndices = 0; + int numIndices = graphNode->indices_id()->size(); + int axis = -1; + int indexId = -1; + for (int i = 0; i < numIndices; i++) { + int32_t index_id = graphNode->indices_id()->Get(i); + if (index_id == -1) { + continue; + } + validIndices++; + axis = i; + indexId = index_id; + } + ET_LOG(Debug, "index_put with %d indices (axis = %d)", validIndices, axis); + ET_CHECK(validIndices > 0); + + if (validIndices == 1) { + MPSGraphTensor* dataTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->values_id()); + MPSGraphTensor* indicesTensor = getMPSGraphTensor(indexId); + if 
(graphNode->values_shape()->size() != 0) { + updatesTensor = [_mpsGraph broadcastTensor:updatesTensor + toShape:getMPSShape(graphNode->values_shape()) + name:nil]; + } + + _idToMPSGraphTensor[graphNode->output_id()] = + [_mpsGraph scatterWithDataTensor:dataTensor + updatesTensor:updatesTensor + indicesTensor:indicesTensor + axis:axis + mode:MPSGraphScatterModeSet + name:nil]; + } else { + ET_CHECK_MSG(false, "Not yet implemented"); + } + } + + return err; +} + } // namespace delegate } // namespace mps } // namespace executor diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index 71c36c967ef..648421ee2cd 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -166,6 +166,7 @@ _DEFINE_MPS_NODE(Isnan); _DEFINE_MPS_NODE(Isinf); _DEFINE_MPS_NODE(Round); + _DEFINE_MPS_NODE(LogicalNot); // Clamp ops _DEFINE_MPS_NODE(Clamp); _DEFINE_MPS_NODE(Where); @@ -178,6 +179,8 @@ //Indexing ops _DEFINE_MPS_NODE(IndexSelect); _DEFINE_MPS_NODE(Embedding); + _DEFINE_MPS_NODE(IndexTensor); + _DEFINE_MPS_NODE(IndexPut); // Reduce ops _DEFINE_MPS_NODE(Mean); // Shape ops @@ -223,6 +226,11 @@ } } +Error +MPSGraphBuilder::compileMetalKernel(NodePtr nodePtr) { + return addNodeToMPSGraph(nodePtr); +} + #undef _DEFINE_MPS_NODE MPSGraphTensor* diff --git a/backends/apple/mps/runtime/operations/ShapeOps.mm b/backends/apple/mps/runtime/operations/ShapeOps.mm index 720161b955d..75de566e4ad 100644 --- a/backends/apple/mps/runtime/operations/ShapeOps.mm +++ b/backends/apple/mps/runtime/operations/ShapeOps.mm @@ -42,13 +42,9 @@ __FUNCTION__, graphNode->input1_id(), graphNode->output_id() ); - NSMutableArray* shape = [NSMutableArray array]; - for (int32_t i = 0; i < graphNode->num_dims(); i++) { - [shape addObject:[NSNumber numberWithInteger:graphNode->shape()->Get(i)]]; - } _idToMPSGraphTensor[graphNode->output_id()] = [_mpsGraph reshapeTensor:getMPSGraphTensor(graphNode->input1_id()) - withShape:shape + withShape:getMPSShape(graphNode->shape()) name:@"view_copy"]; return Error::Ok; @@ -91,7 +87,7 @@ __FUNCTION__, graphNode->output_id() ); - NSMutableArray* inputTensors = [NSMutableArray array]; + NSMutableArray* inputTensors = [NSMutableArray arrayWithCapacity:graphNode->input_ids()->size()];; for (auto id : *graphNode->input_ids()) { MPSGraphTensor* catTensor = getMPSGraphTensor(id); if (catTensor != nil) diff --git a/backends/apple/mps/runtime/operations/UnaryOps.mm b/backends/apple/mps/runtime/operations/UnaryOps.mm index 31246bd44f2..ed06584b271 100644 --- a/backends/apple/mps/runtime/operations/UnaryOps.mm +++ b/backends/apple/mps/runtime/operations/UnaryOps.mm @@ -92,6 +92,7 @@ REGISTER_UNARY_OP(Isnan, isNaN) REGISTER_UNARY_OP(Isinf, isInfinite) REGISTER_UNARY_OP(Round, round) +REGISTER_UNARY_OP(LogicalNot, not) Error diff --git a/backends/apple/mps/serialization/mps_graph_schema.py b/backends/apple/mps/serialization/mps_graph_schema.py index 66697b04b7d..8134091a01d 100644 --- a/backends/apple/mps/serialization/mps_graph_schema.py +++ b/backends/apple/mps/serialization/mps_graph_schema.py @@ -27,6 +27,11 @@ class MPSDataType(IntEnum): mps_data_type_complex_float32 = 11 +class OpType(IntEnum): + mps_graph = 0 + metal_kernel = 1 + + @dataclass class MPSNode1x1: input1_id: int @@ -359,6 +364,11 @@ class MPSRound(MPSNode1x1): pass +@dataclass +class MPSLogicalNot(MPSNode1x1): + pass + + @dataclass class MPSBitwise(MPSNode1x1): pass @@ -434,6 +444,18 @@ class 
MPSEmbedding(MPSNode2x1): sparse: bool = False +@dataclass +class MPSIndexTensor(MPSNode1x1): + indices_id: List[int] = field(default_factory=list) + + +@dataclass +class MPSIndexPut(MPSNode1x1): + indices_id: List[int] = field(default_factory=list) + values_shape: List[int] = field(default_factory=list) + values_id: int = -1 + + ## ## Shape ops ## @@ -664,6 +686,7 @@ class MPSArange: MPSIsnan, MPSIsinf, MPSRound, + MPSLogicalNot, # Linear algebra ops MPSMatMul, MPSAddmm, @@ -678,6 +701,8 @@ class MPSArange: # Indexing ops MPSIndexSelect, MPSEmbedding, + MPSIndexTensor, + MPSIndexPut, # Shape ops MPSPermute, MPSView, @@ -741,3 +766,4 @@ class MPSGraph: input_ids: List[int] output_ids: List[int] constant_ids: List[int] + graph_type: OpType diff --git a/backends/apple/mps/serialization/schema.fbs b/backends/apple/mps/serialization/schema.fbs index c3e3eaa4faf..6ba2c937f32 100644 --- a/backends/apple/mps/serialization/schema.fbs +++ b/backends/apple/mps/serialization/schema.fbs @@ -24,6 +24,13 @@ enum MPSDataType : short { mps_data_type_complex_float32 = 11, } +// ops like index.Tensor and index.put are currentely implemented as +// Metal kernels for unsupported MPSGraph cases. +enum OpType : short { + mps_graph, + metal_kernel +} + // Helper classes to define the number of input and output tensors for a node. // Not meant to be used directly. @@ -145,6 +152,20 @@ table MPSEmbedding { sparse:bool; } +table MPSIndexTensor { + input1_id:int; + indices_id:[int]; + output_id:int; +} + +table MPSIndexPut { + input1_id:int; + indices_id:[int]; + values_shape:[int]; + values_id:int; + output_id:int; +} + // Shape ops. table MPSPermute { input1_id:int; @@ -350,6 +371,7 @@ union MPSNodeUnion { MPSIsnan: _MPSNode1x1, MPSIsinf: _MPSNode1x1, MPSRound: _MPSNode1x1, + MPSLogicalNot: _MPSNode1x1, // Linear algebra ops MPSMatMul: _MPSNode2x1, @@ -366,6 +388,8 @@ union MPSNodeUnion { // Indexing ops MPSIndexSelect, MPSEmbedding, + MPSIndexTensor, + MPSIndexPut, // Reduce ops MPSMean, @@ -438,6 +462,8 @@ table MPSGraph { input_ids:[int]; output_ids:[int]; constant_ids:[int]; + + graph_type:OpType; } root_type MPSGraph; diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 9c2222bb6de..c8fdfeb98e4 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -15,15 +15,28 @@ The MPS backend device maps machine learning computational graphs and primitives * [Introduction to ExecuTorch](intro-how-it-works.md) * [Setting up ExecuTorch](getting-started-setup.md) * [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [ExecuTorch iOS Demo App](demo-apps-ios.md) +* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md) ::: :::: ## Prerequisites (Hardware and Software) -In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components. 
- - macOS 12 / iOS 15 or later (for MPS runtime) - - Xcode command-line tools: xcode-select --install +In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components: + +### Hardware: + - A [mac](https://www.apple.com/mac/) for tracing the model + +### Software: + + - **Ahead of time** tracing: + - [macOS](https://www.apple.com/macos/) 12 + + - **Runtime**: + - [macOS](https://www.apple.com/macos/) >= 12.4 + - [iOS](https://www.apple.com/ios) >= 15.4 + - [Xcode](https://developer.apple.com/xcode/) >= 14.1 ## Setting up Developer Environment @@ -40,47 +53,34 @@ In order to be able to successfully build and run a model using the MPS backend ### AOT (Ahead-of-time) Components **Compiling model for MPS delegate**: -- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program(the `.pte` file) during the runtime to run it using the MPS backend. +- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program (the `.pte` file) during the runtime to run it using the MPS backend. ```bash cd executorch -python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled +# Note: `mps_example` script uses by default the MPSPartitioner for ops that are not yet supported by the MPS delegate. To turn it off, pass `--no-use_partitioner`. +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled --use_fp16 + +# To see all options, run following command: +python3 -m examples.apple.mps.scripts.mps_example --help ``` ### Runtime -**Building the MPS executor runner** -- In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules. - +**Building the MPS executor runner:** ```bash -# Build the mps_executor_runner +# In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules: +cd executorch +./examples/apple/mps/scripts/build_mps_executor_runner.sh +``` + +## Run the mv3 generated model using the mps_executor_runner + ```bash -# Build and install executorch -cmake -DBUCK2="$BUCK" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_BUILD_MPS=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -Bcmake-out . -cmake --build cmake-out -j9 --target install --config Release -CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" -# build mps_executor_runner -rm -rf cmake-out/examples/apple/mps -cmake \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -Bcmake-out/examples/apple/mps \ - examples/apple/mps - -cmake --build cmake-out/examples/apple/mps -j9 --config Release - -# Run the mv2 generated model using the mps_executor_runner ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program +``` -# You should see the following results. Note that no output file will be generated in this example: +- You should see the following results. Note that no output file will be generated in this example: +``` I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded. 
I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1 I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward @@ -94,12 +94,43 @@ I 00:00:00.118731 executorch:mps_executor_runner.mm:438] Model executed successf I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successfully. ``` +### [Optional] Run the generated model directly using pybind +1. Make sure `pybind` MPS support was installed: +```bash +./install_requirements.sh --pybind mps +``` +2. Run the `mps_example` script to trace the model and run it directly from python: +```bash +cd executorch +# Check correctness between PyTorch eager forward pass and ExecuTorch MPS delegate forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --check_correctness +# You should see following output: `Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for mv3_mps are matching!` + +# Check performance between PyTorch MPS forward pass and ExecuTorch MPS forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --bench_pytorch +``` + +### Profiling: +1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model. +```bash +cd executorch +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b +``` +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md). +``` +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs +``` +3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1. +```bash +python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path etrecord.bin +``` + ## Deploying and Running on Device ***Step 1***. Create the ExecuTorch core and MPS delegate frameworks to link on iOS ```bash cd executorch -./build/build_apple_frameworks.sh --Release --mps +./build/build_apple_frameworks.sh --mps ``` `mps_delegate.xcframework` will be in `cmake-out` folder, along with `executorch.xcframework` and `portable_delegate.xcframework`: @@ -123,4 +154,4 @@ In this tutorial, you have learned how to lower a model to the MPS delegate, bui ## Frequently encountered errors and resolution. -If you encountered any bugs or issues following this tutorial please file a bug/issue on the ExecuTorch repository, with hashtag **#mps**. +If you encountered any bugs or issues following this tutorial please file a bug/issue on the [ExecuTorch repository](https://github.com/pytorch/executorch/issues), with hashtag **#mps**. 
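The revised MPS setup doc above points readers at the `mps_example` script for ahead-of-time export but only lists its flags. The following Python sketch shows what a delegated `.pte` could look like when built directly against the partitioner introduced in this patch. The `MPSPartitioner` and `CompileSpec` module paths come from file paths and imports visible in the diff; the `use_fp16` spec key, the torchvision model, the partitioner constructor signature, and the export plumbing are assumptions rather than confirmed API.

```python
# Hedged sketch of lowering MobileNetV3 to the MPS delegate, mirroring what the
# mps_example script is described as doing. Names marked as assumed may differ.
import torch
import torchvision.models as models

from executorch.exir import to_edge
from executorch.exir.backend.backend_details import CompileSpec
# Module path matches backends/apple/mps/partition/mps_partitioner.py in this patch.
from executorch.backends.apple.mps.partition.mps_partitioner import MPSPartitioner

model = models.mobilenet_v3_small(weights=None).eval()
example_inputs = (torch.randn(1, 3, 224, 224),)

# Trace and convert to an edge-dialect program.
edge = to_edge(torch.export.export(model, example_inputs))

# Hypothetical compile spec requesting fp16 execution (key name assumed).
compile_specs = [CompileSpec("use_fp16", bytes([True]))]
delegated = edge.to_backend(MPSPartitioner(compile_specs))

# Write the delegated program; the mps_executor_runner above consumes such files.
with open("mv3_mps.pte", "wb") as f:
    f.write(delegated.to_executorch().buffer)
```

With the MPSPartitioner in this patch, `aten.index.Tensor` cases it cannot express in MPSGraph would be tagged for the Metal-kernel path automatically during `to_backend`.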
diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 94f030310db..4d2862eb727 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -22,6 +22,7 @@ def define_common_targets(is_xplat = False, platforms = []): "-Wno-unused-const-variable", "-Wno-unused-variable", "-fno-objc-arc", + "-std=c++17", ], "deps": [ "//executorch/runtime/core:core", diff --git a/backends/apple/mps/test/test_mps.py b/backends/apple/mps/test/test_mps.py index 691081d35de..5ca9d0175e9 100644 --- a/backends/apple/mps/test/test_mps.py +++ b/backends/apple/mps/test/test_mps.py @@ -677,188 +677,6 @@ def forward(self, x): const_module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_constant_add(self): - class Module(torch.nn.Module): - def __init__(self): - super().__init__() - self._constant = torch.ones(4, 4, 4) - - def forward(self, x): - out1 = x + self._constant - out2 = x + self._constant + self._constant - return out1, out2 - - const_module = Module() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - const_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_mul_scalar_float(self): - class MulScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar = 3.14 - - def forward(self, x): - out1 = torch.ops.aten.mul.Scalar(x, self._scalar) - return out1 - - mul_scalar_module = MulScalarModule() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_mul_scalar_int(self): - class MulScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar = 3 - - def forward(self, x): - out1 = torch.ops.aten.mul.Scalar(x, self._scalar) - return out1 - - mul_scalar_module = MulScalarModule() - model_inputs = (torch.randint(11, (4, 4, 4)),) - - self.lower_and_test_with_partitioner( - mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_1(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.add(x, y, alpha=0.1) - return z - - add_module = AddModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_2(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - z = torch.ops.aten.add.Scalar(x, 2.0) - return z - - add_module = AddModule() - model_inputs = (torch.randn(2, 5),) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_3(self): - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.add(x, y) - return z - - add_module = AddModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - add_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_1(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y, alpha=0.1) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, 
func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_2(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - z = torch.ops.aten.sub.Scalar(x, 2.0) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(2, 5),) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_sub_3(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y) - return z - - sub_module = SubModule() - model_inputs = (torch.randn(1), torch.randn(1)) - - self.lower_and_test_with_partitioner( - sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_scalar_float(self): - class AddScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar_float = 3.14 - - def forward(self, x): - out = torch.ops.aten.add.Scalar(x, self._scalar_float) - return out - - add_scalar_module = AddScalarModule() - model_inputs = (torch.randn(4, 4, 4),) - - self.lower_and_test_with_partitioner( - add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_add_scalar_int(self): - class AddScalarModule(torch.nn.Module): - def __init__(self): - super().__init__() - self._scalar_int = 3 - - def forward(self, x): - out1 = torch.ops.aten.add.Scalar(x, self._scalar_int) - return out1 - - add_scalar_module = AddScalarModule() - model_inputs = (torch.randint(11, (4, 4, 4), dtype=torch.int32),) - - self.lower_and_test_with_partitioner( - add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_logit_1(self): class LogitModule(torch.nn.Module): def __init__(self): @@ -891,22 +709,6 @@ def forward(self, x): logit_module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_backend_div(self): - class DivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x / y - return z - - div_module = DivModule() - model_inputs = (torch.ones(1), torch.ones(1)) - - self.lower_and_test_with_partitioner( - div_module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_round(self): class RoundModule(torch.nn.Module): def __init__(self): @@ -923,36 +725,6 @@ def forward(self, x): module, model_inputs, func_name=inspect.stack()[0].function[5:] ) - def test_mps_backend_fmod(self): - class FModModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.fmod(x, y) - - module = FModModule() - model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) - - self.lower_and_test_with_partitioner( - module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - - def test_mps_backend_floor_divide(self): - class FloorDivideModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.floor_divide(x, y) - - module = FloorDivideModule() - model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) - - self.lower_and_test_with_partitioner( - module, model_inputs, func_name=inspect.stack()[0].function[5:] - ) - def test_mps_backend_amax(self): class AmaxModule(torch.nn.Module): def __init__(self): @@ -1331,6 +1103,149 @@ def forward(self, x): module, model_inputs, func_name=inspect.stack()[0].function[5:] ) + def test_mps_indexing_get_1(self): + class 
IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 1, 2], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_2(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_3(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [[0, 1], [4, 3]]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_4(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_5(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 2, 1], :, 0] + + module = IndexGet() + model_inputs = (torch.ones(3, 2, 4),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indices2d(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, rows, columns): + return x[rows, columns] + + module = IndexGet() + x = torch.arange(0, 12).resize(4, 3) + rows = torch.tensor([[0, 0], [3, 3]]) + columns = torch.tensor([[0, 2], [0, 2]]) + model_inputs = ( + x, + rows, + columns, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_0(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[1:4] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + # using advanced index for column + return x[1:4, [1, 2]] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_boolean_array_indexing(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[x > 5] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + def test_mps_backend_isinf(self): class IsInfModule(torch.nn.Module): def __init__(self): diff --git 
a/backends/apple/mps/test/test_mps_binary_ops.py b/backends/apple/mps/test/test_mps_binary_ops.py new file mode 100644 index 00000000000..fdf2d1fbb94 --- /dev/null +++ b/backends/apple/mps/test/test_mps_binary_ops.py @@ -0,0 +1,296 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSAdd(TestMPS): + class Add(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x + y + z = z + x + z = z + x + z = z + z + return z + + class Add2(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = x + x + return z + + class AddConstant(torch.nn.Module): + def __init__(self, constant): + super().__init__() + self._constant1 = constant + self.register_buffer("_constant2", constant, persistent=False) + self.register_parameter("_constant3", torch.nn.Parameter(constant)) + + def forward(self, x): + out1 = x + self._constant1 + torch.ones(1, 1, 1) + out2 = x + self._constant2 + self._constant3 + return out1, out2 + + def test_fp16_add(self): + inputs = (torch.ones(1).to(torch.float16), torch.ones(1).to(torch.float16)) + self.lower_and_test_with_partitioner( + self.Add(), inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_fp32_add(self): + inputs = (torch.ones(1), torch.ones(1)) + self.lower_and_test_with_partitioner( + self.Add(), inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_fp32_add_constant(self): + inputs = (torch.randn(4, 4, 4),) + self.lower_and_test_with_partitioner( + self.AddConstant(torch.ones(4, 4, 4)), + inputs, + func_name=inspect.stack()[0].function[5:], + ) + + def test_add_w_alpha(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.add(x, y, alpha=0.1) + return z + + add_module = AddModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.ops.aten.add.Scalar(x, 2.0) + return z + + add_module = AddModule() + model_inputs = (torch.randn(2, 5),) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar_int(self): + class AddScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar_int = 3 + + def forward(self, x): + out1 = torch.ops.aten.add.Scalar(x, self._scalar_int) + return out1 + + add_scalar_module = AddScalarModule() + model_inputs = (torch.randint(11, (4, 4, 4), dtype=torch.int32),) + + self.lower_and_test_with_partitioner( + add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_without_alpha(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.add(x, y) + return z + + add_module = AddModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + add_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_add_scalar_float(self): + class AddScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar_float = 3.14 + + def 
forward(self, x): + out = torch.ops.aten.add.Scalar(x, self._scalar_float) + return out + + add_scalar_module = AddScalarModule() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + add_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_constant_add(self): + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self._constant = torch.ones(4, 4, 4) + + def forward(self, x): + out1 = x + self._constant + out2 = x + self._constant + self._constant + return out1, out2 + + const_module = Module() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + const_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSSub(TestMPS): + def test_mps_backend_sub_1(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.sub(x, y, alpha=0.1) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_sub_2(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.ops.aten.sub.Scalar(x, 2.0) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(2, 5),) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_sub_3(self): + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = torch.sub(x, y) + return z + + sub_module = SubModule() + model_inputs = (torch.randn(1), torch.randn(1)) + + self.lower_and_test_with_partitioner( + sub_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSMul(TestMPS): + def test_mps_mul_scalar_float(self): + class MulScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar = 3.14 + + def forward(self, x): + out1 = torch.ops.aten.mul.Scalar(x, self._scalar) + return out1 + + mul_scalar_module = MulScalarModule() + model_inputs = (torch.randn(4, 4, 4),) + + self.lower_and_test_with_partitioner( + mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_mul_scalar_int(self): + class MulScalarModule(torch.nn.Module): + def __init__(self): + super().__init__() + self._scalar = 3 + + def forward(self, x): + out1 = torch.ops.aten.mul.Scalar(x, self._scalar) + return out1 + + mul_scalar_module = MulScalarModule() + model_inputs = (torch.randint(11, (4, 4, 4)),) + + self.lower_and_test_with_partitioner( + mul_scalar_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + +class TestMPSDiv(TestMPS): + def test_mps_backend_div(self): + class DivModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x / y + return z + + div_module = DivModule() + model_inputs = (torch.ones(1), torch.ones(1)) + + self.lower_and_test_with_partitioner( + div_module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_fmod(self): + class FModModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.fmod(x, y) + + module = FModModule() + model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) + + self.lower_and_test_with_partitioner( + module, model_inputs, 
func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_backend_floor_divide(self): + class FloorDivideModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.floor_divide(x, y) + + module = FloorDivideModule() + model_inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4)) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_indexing_ops.py b/backends/apple/mps/test/test_mps_indexing_ops.py new file mode 100644 index 00000000000..7991f1a165a --- /dev/null +++ b/backends/apple/mps/test/test_mps_indexing_ops.py @@ -0,0 +1,225 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSIndexingOps(TestMPS): + def test_mps_indexing_get_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 1, 2], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_2(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[1, 2], [3, 4], [5, 6]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_3(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0], [0, 1, 0]] + + module = IndexGet() + model_inputs = (torch.tensor([[[1, 2], [3, 4], [5, 6]]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_4(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 1, 0], [0, 1, 0]] + + module = IndexGet() + model_inputs = ( + torch.tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]), + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_5(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_6(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[:, [[0, 1], [4, 3]]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indexing_get_7(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 4, 2]] + + module = IndexGet() + model_inputs = (torch.randn(5, 7, 3),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def 
test_mps_indexing_get_8(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[[0, 2, 1], :, 0] + + module = IndexGet() + model_inputs = (torch.ones(3, 2, 4),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_indices2d(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, rows, columns): + return x[rows, columns] + + module = IndexGet() + x = torch.arange(0, 12).resize(4, 3) + rows = torch.tensor([[0, 0], [3, 3]]) + columns = torch.tensor([[0, 2], [0, 2]]) + model_inputs = ( + x, + rows, + columns, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_0(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[1:4] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + def test_mps_slicing_using_advanced_index_for_column_1(self): + class IndexGet(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + # using advanced index for column + return x[1:4, [1, 2]] + + module = IndexGet() + model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) + + # def test_boolean_array_indexing(self): + # class IndexGet(torch.nn.Module): + # def __init__(self): + # super().__init__() + + # def forward(self, x): + # return x[x > 5] + + # module = IndexGet() + # model_inputs = (torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]),) + + # self.lower_and_test_with_partitioner( + # module, model_inputs, func_name=inspect.stack()[0].function[5:] + # ) + + def test_mps_indexing_put_1(self): + + class IndexPut(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y, z): + x[:, :, y] = z + return x + + module = IndexPut() + input = torch.ones(1, 8, 128, 8) + indices = torch.tensor([1]) + values = torch.randn(8, 1, 8) + model_inputs = ( + input, + indices, + values, + ) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_unary_ops.py b/backends/apple/mps/test/test_mps_unary_ops.py new file mode 100644 index 00000000000..69c1f5ba5c6 --- /dev/null +++ b/backends/apple/mps/test/test_mps_unary_ops.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. 
+# + +import inspect + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestMPSLoigcal(TestMPS): + def test_mps_logical_not(self): + class LogicalNot(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.logical_not() + + module = LogicalNot() + model_inputs = (torch.tensor([1, 1, 0, 0], dtype=torch.bool),) + + self.lower_and_test_with_partitioner( + module, model_inputs, func_name=inspect.stack()[0].function[5:] + ) diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index 0e4a7424cc2..6e569dedb50 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -15,7 +15,6 @@ from executorch.exir import ( EdgeCompileConfig, EdgeProgramManager, - ExecutorchProgram, ExirExportedProgram, to_edge, ) @@ -28,7 +27,6 @@ from executorch.sdk.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from torch._export import capture_pre_autograd_graph from torch.export import export, ExportedProgram # Config for Capturing the weights, will be moved in the future @@ -141,7 +139,59 @@ def randomize_bn(num_features: int, dimensionality: int = 2) -> torch.nn.Module: return bn +def dump_bundled_program(sample_inputs, expected_output, executorch_program, func_name): + method_test_suites = [ + MethodTestSuite( + method_name="forward", + test_cases=[ + MethodTestCase(inputs=sample_inputs, expected_outputs=expected_output) + ], + ) + ] + + logging.info(f"Expected output: {expected_output}") + logging.info(" -> Test suites generated successfully") + + bundled_program = BundledProgram(executorch_program, method_test_suites) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + + filename = f"{func_name}.pte" + logging.info(f"Step 4: Saving bundled program to {filename}") + with open(filename, "wb") as file: + file.write(bundled_program_buffer) + + class TestMPS(unittest.TestCase): + def assert_outputs_equal(self, model_output, ref_output): + """ + Helper testing function that asserts that the model output and the reference output + are equal with some tolerance. Due to numerical differences between eager mode and + the MPS's backend, we relax the detal such that absolute tolerance is 1e-3. and + relative tolerance is 1e-3. + """ + + # Compare the result from executor and eager mode direclty + if isinstance(ref_output, tuple) or isinstance(ref_output, list): + # Multiple outputs executor always returns tuple, even if there is one output + self.assertTrue( + len(ref_output) == len(model_output), + msg="Length of outputs is not matching!", + ) + for i in range(len(ref_output)): + self.assertTrue( + torch.allclose( + model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 + ) + ) + else: + # If one output, eager returns tensor while executor tuple of size 1 + self.assertTrue( + torch.allclose(model_output[0], ref_output, atol=1e-03, rtol=1e-03), + msg="Outputs are not matching!", + ) + def lower_module_and_test_output( self, module: Any, @@ -149,26 +199,24 @@ def lower_module_and_test_output( func_name: str, use_partitioner: bool = True, use_fp16: bool = False, + bundled_program=True, ) -> ExirExportedProgram: """ Helper testing function that takes a torch.nn.Module and lowers it to MPS with the given sample inputs. It then runs the lowered module and compares its outputs with the outputs of the eager module. 
""" - logging.info("Step 1: EXIR capturing of original module") - class WrappedModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.one_module = module + model = module.eval() + original_inputs = [] + for t in sample_inputs: + original_inputs.append(t.detach().clone()) + original_inputs = tuple(original_inputs) - def forward(self, *args): - return self.one_module(*args) + expected_output = model(*sample_inputs) - model = WrappedModule() - model = model.eval() - model = capture_pre_autograd_graph(model, sample_inputs) + model = torch._export.capture_pre_autograd_graph(model, sample_inputs) edge_program = export_to_edge( model, @@ -183,10 +231,15 @@ def forward(self, *args): if use_partitioner: logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") - edge = edge_program.to_backend(MPSPartitioner(compile_specs=compile_specs)) - logging.info(f"Lowered graph:\n{edge.exported_program().graph}") + delegated_program = edge_program + delegated_program = edge_program.to_backend( + MPSPartitioner(compile_specs=compile_specs) + ) + logging.info( + f"Lowered graph:\n{delegated_program.exported_program().graph}" + ) - executorch_program = edge.to_executorch( + executorch_program = delegated_program.to_executorch( config=ExecutorchBackendConfig(extract_constant_segment=False) ) else: @@ -206,42 +259,35 @@ def forward(self, *args): ) ) - exported_program: ExirExportedProgram = exir.capture( - WrappedModule(), sample_inputs, _CAPTURE_CONFIG - ).to_edge(_EDGE_COMPILE_CONFIG) - - executorch_program: ExecutorchProgram = exported_program.to_executorch() - - logging.info("Step 3: Generating bundled program") - logging.info( - " -> Number of execution plans: {len(executorch_program.program.execution_plan)}" - ) + if bundled_program: + dump_bundled_program( + sample_inputs, expected_output, executorch_program, func_name + ) + try: + from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, + ) - expected_output = module(*sample_inputs) + logging.info("Testing delegated program using pybind") - method_test_suites = [ - MethodTestSuite( - method_name="forward", - test_cases=[ - MethodTestCase( - inputs=sample_inputs, expected_outputs=module(*sample_inputs) - ) - ], + # Test the model with executor + logging.debug("Initializing MPSGraph") + executorch_module = _load_for_executorch_from_buffer( + executorch_program.buffer ) - ] - logging.info(f"Expected output: {expected_output}") - logging.info(" -> Test suites generated successfully") + model_output = executorch_module.forward(original_inputs) - bundled_program = BundledProgram(executorch_program, method_test_suites) - bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( - bundled_program - ) + logging.info(f"Expected output: {expected_output}") + logging.info(f"MPS delegate output: {model_output}") + self.assert_outputs_equal(model_output, expected_output) + logging.info("Delegated program matches PyTorch Eager mode result!") - filename = f"{func_name}.pte" - logging.info(f"Step 4: Saving bundled program to {filename}") - with open(filename, "wb") as file: - file.write(bundled_program_buffer) + return delegated_program + except ImportError: + logging.info( + "ExecuTorch MPS delegate was built without pybind support. Exiting..." 
+ ) def lower_and_test_with_partitioner( self, @@ -251,7 +297,6 @@ def lower_and_test_with_partitioner( use_fp16: bool = False, ): logging.info(func_name) - # MPS TODO: partitioner support self.lower_module_and_test_output( graph_module, example_inputs, diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 9304e65dc38..ecd98536f4b 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -38,6 +38,118 @@ logger.setLevel(logging.INFO) +class ArmCompileSpecBuilder: + def __init__(self): + self.compile_spec: List[CompileSpec] = [] + self.compiler_flags = [] + self.output_format = None + self.path_for_intermediates = None + self.permute_nhwc = False + + def ethosu_compile_spec( + self, + config: str, + system_config: Optional[str] = None, + memory_mode: Optional[str] = None, + extra_flags: Optional[str] = None, + config_ini: Optional[str] = "Arm/vela.ini", + ): + """ + Generate compile spec for Ethos-U NPU + + Args: + config: Ethos-U accelerator configuration, e.g. ethos-u55-128 + system_config: System configuration to select from the Vel + configuration file + memory_mode: Memory mode to select from the Vela configuration file + extra_flags: Extra flags for the Vela compiler + config_ini: Vela configuration file(s) in Python ConfigParser .ini + file format + """ + assert ( + self.output_format is None + ), f"Output format already set to f{self.output_format}" + self.output_format = "vela" + self.compiler_flags = [ + f"--accelerator-config={config}", + f"--config={config_ini}", + ] + if system_config is not None: + self.compiler_flags.append(f"--system-config={system_config}") + if memory_mode is not None: + self.compiler_flags.append(f"--memory-mode={memory_mode}") + if extra_flags is not None: + self.compiler_flags.append(extra_flags) + + return self + + def tosa_compile_spec(self): + """ + Generate compile spec for TOSA flatbuffer output + """ + assert ( + self.output_format is None + ), f"Output format already set: {self.output_format}" + self.output_format = "tosa" + return self + + def dump_intermediate_tosa(self, output_path: str): + """ + Output intermediate .tosa file + """ + self.path_for_intermediates = output_path + return self + + def set_permute_memory_format(self, set_nhwc_permutation: bool = True): + self.permute_nhwc = set_nhwc_permutation + return self + + def build(self): + """ + Generate a list of compile spec objects from the builder + """ + if self.output_format == "vela": + self.compile_spec += [ + CompileSpec("output_format", "vela".encode()), + CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()), + ] + elif self.output_format == "tosa": + self.compile_spec.append(CompileSpec("output_format", "tosa".encode())) + + if self.path_for_intermediates is not None: + self.compile_spec.append( + CompileSpec("debug_tosa_path", self.path_for_intermediates.encode()) + ) + + if self.permute_nhwc: + self.compile_spec.append( + CompileSpec("permute_memory_format", "nhwc".encode()) + ) + + return self.compile_spec + + +def is_permute_memory(compile_spec: List[CompileSpec]) -> bool: + for spec in compile_spec: + if spec.key == "permute_memory_format": + return spec.value.decode() == "nhwc" + return False + + +def is_tosa(compile_spec: List[CompileSpec]) -> bool: + for spec in compile_spec: + if spec.key == "output_format": + return spec.value.decode() == "tosa" + return False + + +def get_intermediate_path(compile_spec: List[CompileSpec]) -> str: + for spec in compile_spec: + if spec.key == "debug_tosa_path": + return 
spec.value.decode() + return None + + def generate_ethosu_compile_spec( config: str, permute_memory_to_nhwc: Optional[bool] = None, @@ -46,45 +158,31 @@ def generate_ethosu_compile_spec( extra_flags: Optional[str] = None, config_ini: Optional[str] = "Arm/vela.ini", ) -> List[CompileSpec]: - """ - Generate compile spec for Ethos-U NPU - """ - compiler_flags = [f"--accelerator-config={config}", f"--config={config_ini}"] - if system_config is not None: - compiler_flags.append(f"--system-config={system_config}") - if memory_mode is not None: - compiler_flags.append(f"--memory-mode={memory_mode}") - if extra_flags is not None: - compiler_flags.append(extra_flags) - - compile_spec = [ - CompileSpec("output_format", "vela".encode()), - CompileSpec("compile_flags", " ".join(compiler_flags).encode()), - ] - - if permute_memory_to_nhwc: - compile_spec.append(CompileSpec("permute_memory_format", "nhwc".encode())) - - return compile_spec + return ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + config, + system_config=system_config, + memory_mode=memory_mode, + extra_flags=extra_flags, + config_ini=config_ini, + ) + .set_permute_memory_format(permute_memory_to_nhwc) + .build() + ) def generate_tosa_compile_spec( permute_memory_to_nhwc: Optional[bool] = None, output_path: Optional[str] = None, ) -> List[CompileSpec]: - """ - Generate compile spec for TOSA flatbuffer output - """ - - compile_spec = [CompileSpec("output_format", "tosa".encode())] - - if permute_memory_to_nhwc: - compile_spec.append(CompileSpec("permute_memory_format", "nhwc".encode())) - - if output_path is not None: - compile_spec.append(CompileSpec("debug_tosa_path", output_path.encode())) - - return compile_spec + return ( + ArmCompileSpecBuilder() + .tosa_compile_spec() + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_tosa(output_path) + .build() + ) @final diff --git a/backends/arm/arm_quantizer_utils.py b/backends/arm/arm_quantizer_utils.py index 63c98ee42d2..79455695769 100644 --- a/backends/arm/arm_quantizer_utils.py +++ b/backends/arm/arm_quantizer_utils.py @@ -780,11 +780,11 @@ def _annotate_add( if _is_annotated([add_node]): continue + input_act0 = add_node.args[0] input_act_qspec = get_input_act_qspec(quantization_config) - output_act_qspec = get_output_act_qspec(quantization_config) + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, add_node)) input_qspec_map = {} - input_act0 = add_node.args[0] if isinstance(input_act0, Node): if _is_input_large_scalar(input_act0, gm): continue @@ -798,11 +798,14 @@ def _annotate_add( continue if _is_input_non_float_tensor(input_act1): continue - input_qspec_map[input_act1] = input_act_qspec + if input_act0 is not input_act1: + input_qspec_map[input_act1] = shared_with_input0_qspec + else: + input_qspec_map[input_act1] = input_act_qspec add_node.meta["quantization_annotation"] = QuantizationAnnotation( input_qspec_map=input_qspec_map, - output_qspec=output_act_qspec, + output_qspec=shared_with_input0_qspec, _annotated=True, ) return annotated_partitions diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index 0d0f0eb0377..f387672b7b4 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -43,7 +43,14 @@ def vela_compile(tosa_graph, args: List[str]): # invoke vela vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}" - subprocess.run([vela_command], shell=True, check=True) + try: + subprocess.run([vela_command], shell=True, check=True, capture_output=True) + except subprocess.CalledProcessError as 
process_error: + raise RuntimeError( + f"Vela compiler ('{vela_command}') failed with error:\n \ + {process_error.stderr.decode()}\n \ + Stdout:\n{process_error.stdout.decode()}" + ) np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") blocks = b"" diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index cc49e5c3821..444799d3536 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -73,7 +73,7 @@ def define_node( quant_node = input_node.all_input_nodes[0] else: quant_node = input_node - input_zp = get_quant_node_args(quant_node)[1] + input_zp = get_quant_node_args(quant_node).zp attr.ConvAttribute( pad=pad_attr, stride=stride_attr, @@ -111,24 +111,21 @@ def define_node( # rank > 2 linear layer if input_node.target == exir_ops.edge.aten.view_copy.default: quant_node = input_node.all_input_nodes[0] - input_scale, _ = get_quant_node_args(quant_node) + input_scale = get_quant_node_args(quant_node).scale consumer_node = list(node.users)[0] consumer_consumer_node = list(consumer_node.users)[0] - ( - consumer_node_scale, - consumer_node_node_zp, - ) = get_quant_node_args(consumer_consumer_node) - + quant_args = get_quant_node_args(consumer_consumer_node) + consumer_node_scale = quant_args.scale + consumer_node_node_zp = quant_args.zp else: - input_scale, _ = get_quant_node_args(input_node) + input_scale = get_quant_node_args(input_node).scale consumer_node = list(node.users)[0] - ( - consumer_node_scale, - consumer_node_node_zp, - ) = get_quant_node_args(consumer_node) + quant_args = get_quant_node_args(consumer_node) + consumer_node_scale = quant_args.scale + consumer_node_node_zp = quant_args.zp weight_node_q_node = weight_node.all_input_nodes[0] - weight_scale, _ = get_quant_node_args(weight_node_q_node) + weight_scale = get_quant_node_args(weight_node_q_node).scale output_rescale_scale = (input_scale * weight_scale) / consumer_node_scale ( diff --git a/backends/arm/operators/op_common.py b/backends/arm/operators/op_common.py index 4701343e8e8..eadf00c294d 100644 --- a/backends/arm/operators/op_common.py +++ b/backends/arm/operators/op_common.py @@ -31,8 +31,8 @@ def build_avg_pool_2d_common( output_zp = 0 if is_quant_node: - _, input_zp = get_quant_node_args(node.args[0]) - _, output_zp = get_quant_node_args(list(node.users)[0]) + input_zp = get_quant_node_args(node.args[0]).zp + output_zp = get_quant_node_args(list(node.users)[0]).zp attr = ts.TosaSerializerAttribute() attr.PoolAttribute( diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 45a9a6671d1..cc1c1f3c263 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -80,7 +80,7 @@ def define_node( ) input_zp = ( - get_quant_node_args(node.all_input_nodes[0])[1] if is_quant_node else 0 + get_quant_node_args(node.all_input_nodes[0]).zp if is_quant_node else 0 ) attr.ConvAttribute( diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index eb9b0a18fba..3d58f6d628c 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -1,4 +1,4 @@ -# Copyright 2023 Arm Limited and/or its affiliates. +# Copyright 2023-2024 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -11,6 +11,8 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import get_quant_node_args from serializer.tosa_serializer import TosaOp @@ -30,12 +32,31 @@ def define_node( is_quant_node: bool, ) -> None: attr = ts.TosaSerializerAttribute() + + if is_quant_node: + # Get quant parameters + scale, zp, qmin, qmax = get_quant_node_args(node.all_input_nodes[0]) + # Convert to quantized representation + clamp_min_qs = round((inputs[1].number / scale) + zp) + clamp_min_qs = max(clamp_min_qs, qmin) + clamp_max_qs = round((inputs[2].number / scale) + zp) + clamp_max_qs = min(clamp_max_qs, qmax) + # Set fp values to 0.0 since they are not used + clamp_min_fp = 0.0 + clamp_max_fp = 0.0 + else: + clamp_min_fp = inputs[1].number + clamp_max_fp = inputs[2].number + # Set qs values to 0 since they are not used + clamp_min_qs = 0 + clamp_max_qs = 0 + attr.ClampAttribute( tosa_graph.builder, - int(inputs[1].number), - int(inputs[2].number), - inputs[1].number, - inputs[2].number, + clamp_min_qs, + clamp_max_qs, + clamp_min_fp, + clamp_max_fp, ) tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 6a57d895ff0..05e02468d6d 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -50,11 +50,13 @@ def process_placeholder( weight_node = weight_node_permuted.all_input_nodes[0] if input_node.target == exir_ops.edge.aten.view_copy.default: - input_node_scale, _ = get_quant_node_args(input_node.all_input_nodes[0]) + input_node_scale = get_quant_node_args( + input_node.all_input_nodes[0] + ).scale else: - input_node_scale, _ = get_quant_node_args(input_node) + input_node_scale = get_quant_node_args(input_node).scale - weight_node_scale, _ = get_quant_node_args(weight_node) + weight_node_scale = get_quant_node_args(weight_node).scale bias_values_quantized = ( (parameter_values / (input_node_scale * weight_node_scale)) @@ -81,8 +83,8 @@ def process_placeholder( bias_node, ) = consumer_node.all_input_nodes - input_node_scale, _ = get_quant_node_args(input_node) - weight_node_scale, _ = get_quant_node_args(weight_node) + input_node_scale = get_quant_node_args(input_node).scale + weight_node_scale = get_quant_node_args(weight_node).scale bias_scales = input_node_scale * weight_node_scale parameter_values_quantized = ( diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index eeccfadf5b9..6a8f4b526d6 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -5,6 +5,9 @@ # LICENSE file in the root directory of this source tree. import shutil +import tempfile + +from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder # TODO: fixme! These globs are a temporary workaround. Reasoning: # Running the jobs in _unittest.yml will not work since that environment doesn't @@ -13,3 +16,36 @@ # should be installed in the CI env. TOSA_REF_MODEL_INSTALLED = shutil.which("tosa_reference_model") VELA_INSTALLED = shutil.which("vela") + + +def get_tosa_compile_spec(permute_memory_to_nhwc=False, custom_path=None): + """ + Default compile spec for TOSA tests. 
+ """ + intermediate_path = custom_path or tempfile.mkdtemp(prefix="arm_tosa_") + compile_spec = ( + ArmCompileSpecBuilder() + .tosa_compile_spec() + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_tosa(intermediate_path) + .build() + ) + return compile_spec + + +def get_u55_compile_spec(permute_memory_to_nhwc=False): + """ + Default compile spec for Ethos-U55 tests. + """ + compile_spec = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + "ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + extra_flags=None, + ) + .set_permute_memory_format(permute_memory_to_nhwc) + .build() + ) + return compile_spec diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 6bb70d98c59..df55e5253e2 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -10,8 +10,9 @@ import unittest import torch -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -45,8 +46,7 @@ def _tosa_MI_pipeline(self, module: torch.nn.Module, dump_file=None): ArmTester( module, inputs=module.get_inputs(), - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, + compile_spec=common.get_tosa_compile_spec(), ) .export() .to_edge() @@ -60,8 +60,7 @@ def _tosa_BI_pipeline(self, module: torch.nn.Module, dump_file=None): ArmTester( module, inputs=module.get_inputs(), - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, + compile_spec=common.get_tosa_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index ded3b6f6919..6c247ac10e9 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -9,11 +9,11 @@ import unittest import torch -import torchvision.models as models from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester + +from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights @@ -46,9 +46,7 @@ def test_mv2_tosa_MI(self): ArmTester( self.mv2, inputs=self.model_inputs, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .to_edge() @@ -62,9 +60,7 @@ def test_mv2_tosa_BI(self): ArmTester( self.mv2, inputs=self.model_inputs, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize(Quantize(calibrate=False)) .export() @@ -74,7 +70,7 @@ def test_mv2_tosa_BI(self): .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -89,9 +85,7 @@ def test_mv2_u55_BI(self): ArmTester( 
self.mv2, inputs=self.model_inputs, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize(Quantize(calibrate=False)) .export() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 091e4b27ff5..f77feec586f 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -12,8 +12,8 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester + +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -57,9 +57,7 @@ def _test_add_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=False, + compile_spec=common.get_tosa_compile_spec(), ) .export() .check_count({"torch.ops.aten.add.Tensor": 1}) @@ -70,7 +68,7 @@ def _test_add_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -83,9 +81,7 @@ def _test_add_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=False, + compile_spec=common.get_tosa_compile_spec(), ) .quantize() .export() @@ -98,7 +94,7 @@ def _test_add_tosa_BI_pipeline( ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -111,8 +107,7 @@ def _test_add_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, + compile_spec=common.get_u55_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 6e0aa8cbad8..259e8e11809 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -12,8 +12,7 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -51,9 +50,7 @@ def _test_avgpool2d_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .check(["torch.ops.aten.avg_pool2d.default"]) @@ -65,7 +62,7 @@ def _test_avgpool2d_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -78,9 +75,7 @@ def _test_avgpool2d_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + 
compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -93,7 +88,7 @@ def _test_avgpool2d_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -106,9 +101,7 @@ def _test_avgpool2d_tosa_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 4f94412bf25..1f0ef7bc293 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -11,8 +11,8 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester + +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -248,9 +248,7 @@ def _test_conv2d_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .to_edge() @@ -260,7 +258,7 @@ def _test_conv2d_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -275,9 +273,7 @@ def _test_conv2d_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -288,7 +284,7 @@ def _test_conv2d_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -301,9 +297,7 @@ def _test_conv2d_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index c96a2920e2e..2bde0688489 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -11,8 +11,8 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -127,6 +127,32 @@ def forward(self, x): return x +class ComboConvRelu6(torch.nn.Module): + edge_op_list = [ + "executorch_exir_dialects_edge__ops_aten_convolution_default", + 
"executorch_exir_dialects_edge__ops_aten_hardtanh_default", + ] + + test_data = [ + (20 * torch.randn(1, 3, 256, 256),), + (5 * torch.randn(1, 3, 256, 256),), + (torch.randn(1, 3, 256, 256),), + (-5 * torch.randn(1, 3, 256, 256),), + ] + + def __init__(self): + super().__init__() + self.conv2d = torch.nn.Conv2d( + in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1 + ) + self.relu6 = torch.nn.ReLU6() + + def forward(self, x): + x = self.conv2d(x) + x = self.relu6(x) + return x + + class TestConvCombos(unittest.TestCase): def _test_conv_combo_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] @@ -135,9 +161,7 @@ def _test_conv_combo_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .to_edge() @@ -147,7 +171,7 @@ def _test_conv_combo_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -164,9 +188,7 @@ def _test_conv_combo_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -177,7 +199,7 @@ def _test_conv_combo_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(atol=atol, rtol=rtol, qtol=1) + tester.run_method_and_compare_outputs(atol=atol, rtol=rtol, qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -190,9 +212,7 @@ def _test_conv_combo_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -229,15 +249,9 @@ def test_conv_batchnorm_relu_tosa_MI(self): model = ComboConvBatchnormRelu() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - # TODO(MLETORCH-85): Investigate numerical issue. This diff is present in legacy - # testcase as well (and also not tested). For now, just increase the - # tolerance, such that we don't skip the test entirely (i.e. we maintain - # functionality). 
def test_conv_batchnorm_relu_tosa_BI(self): model = ComboConvBatchnormRelu() - self._test_conv_combo_tosa_BI_pipeline( - model, model.get_inputs(), atol=1.0, rtol=1.0 - ) + self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) @unittest.skipIf( not common.VELA_INSTALLED, @@ -247,6 +261,31 @@ def test_conv_batchnorm_relu_u55_BI(self): model = ComboConvBatchnormRelu() self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + ################## + ## Conv + ReLU6 ## + ################## + @parameterized.expand(ComboConvRelu6.test_data) + def test_conv_relu6_tosa_MI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_tosa_MI_pipeline(model, test_data) + + @parameterized.expand(ComboConvRelu6.test_data) + def test_conv_relu6_tosa_BI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_tosa_BI_pipeline(model, test_data) + + @parameterized.expand(ComboConvRelu6.test_data) + @unittest.skipIf( + not common.VELA_INSTALLED, + "There is no point in running U55 tests if the Vela tool is not installed", + ) + def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_u55_BI_pipeline(model, test_data) + ############################### ## Block bottleneck residual ## ############################### @@ -254,14 +293,9 @@ def test_block_bottleneck_residual_tosa_MI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - # TODO(MLETORCH-85): Investigate numerical issue. This diff was present in legacy - # testcase as well. For now, just increase the tolerance, such that - # we don't skip the test entirely (i.e. we maintain functionality). 
def test_block_bottleneck_residual_tosa_BI(self): model = ComboBlockBottleneckResidual() - self._test_conv_combo_tosa_BI_pipeline( - model, model.get_inputs(), atol=1.0, rtol=1.0 - ) + self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) @unittest.skipIf( not common.VELA_INSTALLED, diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 006a8567387..0901a49293b 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -12,8 +12,8 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv import Conv2d -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester + +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -134,9 +134,7 @@ def _test_dw_conv2d_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .to_edge() @@ -146,7 +144,7 @@ def _test_dw_conv2d_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -159,9 +157,7 @@ def _test_dw_conv2d_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -172,7 +168,7 @@ def _test_dw_conv2d_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -185,9 +181,7 @@ def _test_dw_conv2d_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index afb5a57b2d1..6313db3a167 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -12,8 +12,8 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester + +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -114,8 +114,7 @@ def _test_linear_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, + compile_spec=common.get_tosa_compile_spec(), ) .export() .check_count({"torch.ops.aten.addmm.default": 1}) @@ -126,7 +125,7 @@ def _test_linear_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical 
correctness tests" @@ -139,8 +138,7 @@ def _test_linear_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, + compile_spec=common.get_tosa_compile_spec(), ) .quantize() .export() @@ -152,7 +150,7 @@ def _test_linear_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=True) + tester.run_method_and_compare_outputs(qtol=True) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -165,8 +163,7 @@ def _test_linear_tosa_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, + compile_spec=common.get_u55_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 95bc8c4babc..79ba3de7dad 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -12,8 +12,7 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -56,9 +55,7 @@ def _test_meandim_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .export() .check(["torch.ops.aten.mean.dim"]) @@ -70,7 +67,7 @@ def _test_meandim_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -83,9 +80,7 @@ def _test_meandim_tosa_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, - permute_memory_to_nhwc=True, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() @@ -98,7 +93,7 @@ def _test_meandim_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -111,9 +106,7 @@ def _test_meandim_tosa_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, - permute_memory_to_nhwc=True, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 5d5766f019b..32bd2253464 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -12,8 +12,7 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized logger = logging.getLogger(__name__) @@ -44,8 +43,7 @@ def _test_softmax_tosa_MI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.MI, - 
backend=ArmBackendSelector.TOSA, + compile_spec=common.get_tosa_compile_spec(), ) .export() .check(["torch.ops.aten._softmax.default"]) @@ -57,7 +55,7 @@ def _test_softmax_tosa_MI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method_and_compare_outputs() else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -68,10 +66,7 @@ def _test_softmax_tosa_BI_pipeline( ): tester = ( ArmTester( - module, - inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.TOSA, + module, inputs=test_data, compile_spec=common.get_tosa_compile_spec() ) .quantize() .export() @@ -84,7 +79,7 @@ def _test_softmax_tosa_BI_pipeline( .to_executorch() ) if common.TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs(qtol=1) + tester.run_method_and_compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -97,8 +92,7 @@ def _test_softmax_tosa_u55_BI_pipeline( ArmTester( module, inputs=test_data, - profile=TosaProfile.BI, - backend=ArmBackendSelector.ETHOS_U55, + compile_spec=common.get_u55_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 2d0816a2943..dceae602716 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -3,17 +3,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from enum import Enum from typing import List, Optional, Tuple, Union import numpy as np import torch + from executorch.backends.arm.arm_backend import ( - generate_ethosu_compile_spec, - generate_tosa_compile_spec, + get_intermediate_path, + is_permute_memory, + is_tosa, ) - from executorch.backends.arm.arm_partitioner import ArmPartitioner from executorch.backends.arm.arm_quantizer import ( ArmQuantizer, @@ -22,7 +22,6 @@ from executorch.backends.arm.test.tosautil.tosa_test_utils import ( QuantizationParams, - TosaProfile, TosaTestUtils, ) @@ -35,12 +34,89 @@ ) from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export import ExportedProgram -class ArmBackendSelector(Enum): - TOSA = "tosa" - ETHOS_U55 = "ethos-u55" +def _get_input_params( + program: ExportedProgram, is_quantized: bool +) -> Tuple[str, Union[List[QuantizationParams], List[None]]]: + """ + Get name and optionally quantization parameters for the inputs to this + model. + + Args: + program (ExportedProgram): The program to get input parameters from + Returns: + Tuple[str, Optional[QuantizationParams]]: A tuple containing the + input node names and their quantization parameters. + """ + input_names = [] + # E.g. bias and weights are 'placeholders' as well. This is used to + # get only the use inputs. 
+ usr_inputs = program.graph_signature.user_inputs + for node in program.graph.nodes: + if node.op == "placeholder" and node.name in usr_inputs: + input_names.append(node.name) + continue + + if is_quantized: + quant_params = [] + for node in program.graph.nodes: + if ( + node.target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + and node.args[0].name in input_names + ): + qp = QuantizationParams( + node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] + ) + quant_params.append(qp) + if len(quant_params) == len( + input_names + ): # break early if we have all the inputs' quantization parameters + break + assert len(quant_params) != 0, "Quantization parameters not found" + return (input_names, quant_params) + else: + return (input_names, len(input_names) * [None]) # return a list of None's + + +def _get_output_param( + program: ExportedProgram, is_quantized: bool +) -> Tuple[str, Union[QuantizationParams, None]]: + """ + Get name and optionally quantization parameters for the output of this + model. + + Args: + program (ExportedProgram): The program to get output parameters from. + Returns: + Tuple[str, Optional[QuantizationParams]]: A tuple containing the + output node name and its quantization parameters. + """ + output_node = None + for node in program.graph.nodes: + if node.op == "output": + output_node = node + break + + if is_quantized: + quant_params = None + for node in program.graph.nodes: + if ( + node.target + == torch.ops.quantized_decomposed.dequantize_per_tensor.default + and node == output_node.args[0][0] + ): + quant_params = QuantizationParams( + node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] + ) + break # break early, there's only one output node + assert quant_params is not None, "Quantization parameters not found" + return (output_node.name, quant_params) + else: + return (output_node.name, None) class Partition(Partition): @@ -75,44 +151,23 @@ def __init__( self, model: torch.nn.Module, inputs: Tuple[torch.Tensor], - backend: ArmBackendSelector = ArmBackendSelector.TOSA, - profile: TosaProfile = TosaProfile.BI, - permute_memory_to_nhwc: bool = False, + compile_spec: List[CompileSpec] = None, ): """ Args: model (torch.nn.Module): The model to test inputs (Tuple[torch.Tensor]): The inputs to the model - backend (ArmBackendSelector): The backend to use. E.g. TOSA or - ETHOS_U55. - TOSA: Lower to TOSA and test numerical correctness compared to - torch reference. - ETHOS_U55: Lower to TOSA, then let Vela compile. Only - functional test, no numerical checks. - profile (TosaProfile): The TOSA profile to use.
Either - TosaProfile.BI or TosaProfile.MI - permute_memory_to_nhwc (bool) : flag for enabling the memory format - permutation to nhwc as required by TOSA + compile_spec (List[CompileSpec]): The compile spec to use """ + + # Use the TosaTestUtils if you are using a TOSA backend self.tosa_test_util = None - self.is_quantized = profile == TosaProfile.BI - self.permute_memory_to_nhwc = permute_memory_to_nhwc - - if backend == ArmBackendSelector.TOSA: - self.tosa_test_util = TosaTestUtils(profile=profile) - # The spec below tiggers arm_backend.py to output two files: - # 1) output.tosa - # 2) desc.json - # Saved on disk in self.tosa_test_util.intermediate_path - self.compile_spec = generate_tosa_compile_spec( - permute_memory_to_nhwc, self.tosa_test_util.intermediate_path - ) - elif backend == ArmBackendSelector.ETHOS_U55: - self.compile_spec = generate_ethosu_compile_spec( - config="ethos-u55-128", permute_memory_to_nhwc=permute_memory_to_nhwc - ) - else: - raise ValueError(f"Unknown backend: {backend}") + if is_tosa(compile_spec): + intermediate_path = get_intermediate_path(compile_spec) + self.tosa_test_util = TosaTestUtils(intermediate_path=intermediate_path) + + self.compile_spec = compile_spec + super().__init__(model, inputs) def quantize(self, quantize_stage: Optional[Quantize] = None): @@ -134,8 +189,14 @@ def partition(self, partition_stage: Optional[Partition] = None): partition_stage = Partition(arm_partitioner) return super().partition(partition_stage) - def run_method( - self, stage: Optional[str] = None, inputs: Optional[Tuple[torch.Tensor]] = None + def run_method_and_compare_outputs( + self, + stage: Optional[str] = None, + inputs: Optional[Tuple[torch.Tensor]] = None, + num_runs=1, + atol=1e-03, + rtol=1e-03, + qtol=0, ): """ This function runs the tosa_reference_model tool to get output data @@ -150,130 +211,74 @@ def run_method( Todo: * A lot of the stuff in this method should be broken out into a run_artifact() method on a ToExecutorch stage class. - * See "TODO" inline below """ assert ( self.tosa_test_util is not None ), "self.tosa_test_util is not initialized, cannot use run_method()" - inputs_to_run = inputs or self.inputs + + number_of_runs = 1 if inputs is not None else num_runs + stage = stage or self.cur export_stage = self.stages[self.stage_name(Export)] - (input_names, qp_input) = self._get_input_params(export_stage.artifact) - (output_name, qp_output) = self._get_output_param(export_stage.artifact) + is_quantized = self.stages["Quantize"] is not None + (input_names, qp_input) = _get_input_params(export_stage.artifact, is_quantized) + (output_name, qp_output) = _get_output_param( + export_stage.artifact, is_quantized + ) # Calculate the reference output using the original module or the quant - # module. self.quantization_scale is used by compare_outputs() to - # calculate the tolerance - self.quantization_scale = None if qp_output is None else qp_output.scale - if self.is_quantized: - module_for_ref = self.stages[self.stage_name(Quantize)].artifact + # module. 
+ quantization_scale = None + if is_quantized: + quantization_scale = qp_output.scale + quantize_stage = self.stages[self.stage_name(Quantize)] + module_for_ref = quantize_stage.artifact + print(f"Comparing Stage {stage} with Stage {quantize_stage}") else: module_for_ref = self.original_module - self.reference_output = self._calculate_reference_output( - module_for_ref, inputs_to_run - ) - - # Transpose input data which is on NCHW format to NHWC format, - if self.permute_memory_to_nhwc and len(inputs_to_run[0].shape) == 4: - NHWC_Order = (0, 2, 3, 1) - inputs_to_run = (np.transpose(inputs_to_run[0], NHWC_Order),) - - # Run the TOSA ref model to get the output tensor, which will be - # compared to the torch output in compare_outputs() - tosa_output = self.tosa_test_util.run_tosa_ref_model( - params_input=(input_names, qp_input), - param_output=(output_name, qp_output), - inputs=inputs_to_run, - ) - - # Transpose back to NCHW format for comparison to torch output - if self.permute_memory_to_nhwc and len(tosa_output.shape) == 4: - NCHW_Order = (0, 3, 1, 2) - tosa_output = (np.transpose(tosa_output, NCHW_Order),) - - self.stage_output = tosa_output + print(f"Comparing Stage {stage} with original module") + + # Loop inputs and compare TOSA ref model output with Torch reference + # for each loop iteration. + for run_iteration in range(number_of_runs): + inputs_to_run = inputs if inputs else next(self.generate_random_inputs()) + input_shapes = [generated_input.shape for generated_input in inputs_to_run] + print(f"Run {run_iteration} with input shapes: {input_shapes}") + + # Get Torch reference data... + reference_output = self._calculate_reference_output( + module_for_ref, inputs_to_run + ) - return self + # ...now get TOSA ref model data + # Transpose input data which is on NCHW format to NHWC format, + is_nhwc = is_permute_memory(self.compile_spec) + if is_nhwc and len(inputs_to_run[0].shape) == 4: + NHWC_Order = (0, 2, 3, 1) + inputs_to_run = (np.transpose(inputs_to_run[0], NHWC_Order),) + + # Run the TOSA ref model to get the output tensor, which will be + # compared to the torch output in compare_outputs() + tosa_output = self.tosa_test_util.run_tosa_ref_model( + params_input=(input_names, qp_input), + param_output=(output_name, qp_output), + inputs=inputs_to_run, + ) - def _get_input_params( - self, program: ExportedProgram - ) -> Tuple[str, Union[List[QuantizationParams], List[None]]]: - """ - Get name and optionally quantization parameters for the inputs to this - model. + # Transpose back to NCHW format for comparison to torch output + if is_nhwc and len(tosa_output.shape) == 4: + NCHW_Order = (0, 3, 1, 2) + tosa_output = (np.transpose(tosa_output, NCHW_Order),) - Args: - program (ExportedProgram): The program to get input parameters from - Returns: - Tuple[str, Optional[QuantizationParams]]: A tuple containing the - input node names and their quantization parameters. - """ - input_names = [] - # E.g. bias and weights are 'placeholders' as well. This is used to - # get only the use inputs. 
- usr_inputs = program.graph_signature.user_inputs - for node in program.graph.nodes: - if node.op == "placeholder" and node.name in usr_inputs: - input_names.append(node.name) - continue - - if self.is_quantized: - quant_params = [] - for node in program.graph.nodes: - if ( - node.target - == torch.ops.quantized_decomposed.quantize_per_tensor.default - and node.args[0].name in input_names - ): - qp = QuantizationParams( - node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] - ) - quant_params.append(qp) - if len(quant_params) == len( - input_names - ): # break early if we have all the inputs quantized parameters - break - assert len(quant_params) != 0, "Quantization paramerters not found" - return (input_names, quant_params) - else: - return (input_names, len(input_names) * [None]) # return a list of None's + stage_output = tosa_output - def _get_output_param( - self, program: ExportedProgram - ) -> Tuple[str, Union[QuantizationParams, None]]: - """ - Get name and optionally quantization parameters for the inputs to this - model. - - Args: - program (ExportedProgram): The program to get output parameters from. - Returns: - Tuple[str, Optional[QuantizationParams]]: A tuple containing the - output node name and its quantization parameters. - """ - output_node = None - for node in program.graph.nodes: - if node.op == "output": - output_node = node - break + # Output from running artifact at stage + self._compare_outputs( + reference_output, stage_output, quantization_scale, atol, rtol, qtol + ) - if self.is_quantized: - quant_params = None - for node in program.graph.nodes: - if ( - node.target - == torch.ops.quantized_decomposed.dequantize_per_tensor.default - and node == output_node.args[0][0] - ): - quant_params = QuantizationParams( - node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] - ) - break # break early, there's only one output node - assert quant_params is not None, "Quantization paramerters not found" - return (output_node.name, quant_params) - else: - return (output_node.name, None) + return self @staticmethod def _calculate_reference_output( diff --git a/backends/arm/test/tosautil/tosa_test_utils.py b/backends/arm/test/tosautil/tosa_test_utils.py index 2ac9794fe89..df9b1455281 100644 --- a/backends/arm/test/tosautil/tosa_test_utils.py +++ b/backends/arm/test/tosautil/tosa_test_utils.py @@ -15,8 +15,6 @@ import numpy as np import torch -from executorch.backends.arm.test.test_models import TosaProfile - logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) @@ -39,15 +37,11 @@ def __init__(self, node_name: str, zp: int, scale: float): class TosaTestUtils: def __init__( self, - intermediate_path: Optional[str] = None, + intermediate_path: str, tosa_ref_model_path: Optional[str] = None, - profile: Optional[TosaProfile] = None, ): - self.intermediate_path = intermediate_path or tempfile.mkdtemp( - prefix="arm_tosa_" - ) + self.intermediate_path = intermediate_path self.tosa_ref_model_path = tosa_ref_model_path or "tosa_reference_model" - self.profile = profile or TosaProfile.MI assert os.path.exists( self.intermediate_path ), f"TOSA artifact path don't exist! Path: {self.intermediate_path}" @@ -145,7 +139,7 @@ def run_tosa_ref_model( as a next step. 
See: https://review.mlplatform.org/plugins/gitiles/tosa/reference_model/#executable-usage """ - + is_quantized = param_output[1] is not None desc_file_path = os.path.join(self.intermediate_path, "desc.json") assert os.path.exists( desc_file_path @@ -159,7 +153,7 @@ def run_tosa_ref_model( ): data_np = data.detach().numpy() - if self.profile is TosaProfile.BI: + if is_quantized: assert ( quant_param.node_name == input_name ), "These quantization params do not match the input tensor name" @@ -190,7 +184,7 @@ def run_tosa_ref_model( # Load the output file (OFM) and return it as a numpy array tosa_ref_output = np.load(ofm_file_npy) - if self.profile is TosaProfile.BI: + if is_quantized: # Need to dequant back to FP32 for comparison with torch output quant_param = param_output[1] assert ( diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index 9e04ba68eef..25fba250395 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -6,8 +6,10 @@ # Utiliy functions for TOSA quantized lowerings import math +from typing import NamedTuple import serializer.tosa_serializer as ts +import torch.fx from executorch.backends.arm.tosa_mapping import TosaArg from executorch.exir.dialects._ops import ops as exir_ops from serializer.tosa_serializer import TosaOp, TosaSerializerTensor @@ -17,7 +19,14 @@ dq_q_ops = [q_op, dq_op] -def is_quant_node(node): +class QuantArgs(NamedTuple): + scale: float + zp: int + qmin: int + qmax: int + + +def is_quant_node(node: torch.fx.Node): consumer_node = list(node.users)[0] input = node.all_input_nodes[0] @@ -41,10 +50,22 @@ def is_quant_arg(arg): return consumer_node.target == q_op -def get_quant_node_args(node): +def get_quant_node_args(node: torch.fx.Node): + """ + Get the quantization parameters from a quant node. + + Args: + node: The quant node. 
+ Returns: + QuantArgs: scale, zp, qmin, qmax + """ quant_args = [TosaArg(arg) for arg in node.args] - # Return the scale and zp - return quant_args[1].number, quant_args[2].number + return QuantArgs( + quant_args[1].number, + quant_args[2].number, + quant_args[3].number, + quant_args[4].number, + ) # Check if scale32 mode is used for given output element type diff --git a/backends/example/TARGETS b/backends/example/TARGETS index 4bcc77e2a10..59df492e027 100644 --- a/backends/example/TARGETS +++ b/backends/example/TARGETS @@ -53,7 +53,6 @@ python_unittest( "//caffe2:torch", "//executorch/exir:delegate", "//executorch/exir:lib", - "//executorch/exir/backend:backend_api", "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", "//pytorch/vision:torchvision", ], diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 27354e02ad8..d85e8e87229 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -11,7 +11,7 @@ from executorch import exir from executorch.backends.example.example_partitioner import ExamplePartitioner from executorch.backends.example.example_quantizer import ExampleQuantizer -from executorch.exir.backend.backend_api import to_backend +from executorch.exir import to_edge from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( DuplicateDequantNodePass, @@ -19,8 +19,8 @@ from executorch.exir.delegate import executorch_call_delegate from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +from torch.export import export -# @manual=//pytorch/vision:torchvision from torchvision.models.quantization import mobilenet_v2 @@ -40,7 +40,6 @@ def get_example_inputs(): model = Conv2dModule() example_inputs = Conv2dModule.get_example_inputs() - CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( _check_ir_validity=False, ) @@ -59,24 +58,23 @@ def get_example_inputs(): m = convert_pt2e(m) quantized_gm = m - exported_program = exir.capture( - quantized_gm, copy.deepcopy(example_inputs), CAPTURE_CONFIG - ).to_edge(EDGE_COMPILE_CONFIG) + exported_program = to_edge( + export(quantized_gm, copy.deepcopy(example_inputs)), + compile_config=EDGE_COMPILE_CONFIG, + ) - lowered_export_program = to_backend( - exported_program.exported_program, + lowered_export_program = exported_program.to_backend( ExamplePartitioner(), ) print("After lowering to qnn backend: ") - lowered_export_program.graph.print_tabular() + lowered_export_program.exported_program().graph.print_tabular() def test_delegate_mobilenet_v2(self): model = mobilenet_v2(num_classes=3) model.eval() example_inputs = (torch.rand(1, 3, 320, 240),) - CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( _check_ir_validity=False, ) @@ -91,20 +89,22 @@ def test_delegate_mobilenet_v2(self): m = convert_pt2e(m) quantized_gm = m - exported_program = exir.capture( - quantized_gm, copy.deepcopy(example_inputs), CAPTURE_CONFIG - ).to_edge(EDGE_COMPILE_CONFIG) + exported_program = to_edge( + export(quantized_gm, copy.deepcopy(example_inputs)), + compile_config=EDGE_COMPILE_CONFIG, + ) - lowered_export_program = to_backend( - exported_program.transform(DuplicateDequantNodePass()).exported_program, + lowered_export_program = exported_program.transform( + [DuplicateDequantNodePass()] + ).to_backend( ExamplePartitioner(), ) - lowered_export_program.graph.print_tabular() + 
lowered_export_program.exported_program().graph.print_tabular() call_deleage_node = [ node - for node in lowered_export_program.graph.nodes + for node in lowered_export_program.exported_program().graph.nodes if node.target == executorch_call_delegate ] self.assertEqual(len(call_deleage_node), 1) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 8883e5ee026..727952b4fe4 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -253,7 +253,7 @@ target_link_libraries(qnn_executorch_backend qnn_executorch_header qnn_schema qnn_manager - executorch + executorch_no_prim_ops qcir_utils ) target_link_libraries(utils diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index b63a5583b10..8adbfde0b92 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -41,8 +41,10 @@ op_skip_ops, op_slice_copy, op_softmax, + op_sqrt, op_squeeze, op_sub, + op_sum_int_list, op_tanh, op_transpose, op_unsqueeze, @@ -86,7 +88,9 @@ op_slice_copy, op_softmax, op_squeeze, + op_sqrt, op_sub, + op_sum_int_list, op_tanh, op_transpose, op_unsqueeze, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index 3dae32f882e..60c51b055d0 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -14,8 +14,6 @@ from executorch.exir.dialects._ops import ops as exir_ops -from .qnn_constants import QNN_uint16 - from .utils import get_parameter, is_graph_input, is_graph_output, is_parameter @@ -26,16 +24,17 @@ # Note that there is no int64 tensor data type in Qnn. torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UNDEFINED, torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_8, - QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16, + torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16, } QNN_TENSOR_TYPE_MAP = { + torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8, torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16, torch.int32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32, torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_64, torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_8, - QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16, + torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16, float: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, } @@ -169,7 +168,7 @@ def get_quant_encoding_conf( return self.make_qnn_per_tensor_config(quant_attrs) def get_quant_tensor_value( - self, tensor: torch.Tensor, quant_attrs: Dict, dtype, bitwidth + self, tensor: torch.Tensor, quant_attrs: Dict, quant_configs: Dict ) -> torch.Tensor: if quant_attrs["encoding"] in PER_TENSOR_ENCODING: scale = quant_attrs["scale"] @@ -178,16 +177,11 @@ def get_quant_tensor_value( scale = quant_attrs["scales"] zero_point = quant_attrs["zero_points"] - # To bypass torch.uint16 quantization is not supported - dtype = ( - torch.int32 - if dtype == PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16 - else quant_attrs["dtype"] - ) + dtype = quant_configs["dtype"] tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly - if bitwidth == 4: + if quant_configs.get("bitwidth") == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) tensor = 
torch.bitwise_and(mask, tensor) return tensor @@ -236,7 +230,7 @@ def get_data_type( <= torch.iinfo(torch.int16).max - torch.iinfo(torch.int16).min ): if unsigned: - quant_config["dtype"] = QNN_uint16 + quant_config["dtype"] = torch.uint16 else: quant_config["dtype"] = torch.int16 return QNN_QUANT_TYPE_MAP[quant_config["dtype"]] @@ -327,8 +321,7 @@ def define_tensor( tensor = self.get_quant_tensor_value( tensor, node.meta["quant_attrs"], - dtype, - quant_configs.get("bitwidth"), + quant_configs, ) tensor_wrapper = PyQnnWrapper.TensorWrapper( tensor_name, diff --git a/backends/qualcomm/builders/op_dequantize.py b/backends/qualcomm/builders/op_dequantize.py index 9c351103949..f80103b4b89 100644 --- a/backends/qualcomm/builders/op_dequantize.py +++ b/backends/qualcomm/builders/op_dequantize.py @@ -56,20 +56,16 @@ def define_node( @register_node_visitor -class PerTensorDequantizeDefault(DequantizeOpBase): - target = ["quantized_decomposed.dequantize_per_tensor.default"] +class PerTensorDequantize(DequantizeOpBase): + target = [ + "quantized_decomposed.dequantize_per_tensor.default", + "quantized_decomposed.dequantize_per_tensor.tensor", + ] @register_node_visitor -class PerTensorDequantizeTensor(DequantizeOpBase): - target = ["quantized_decomposed.dequantize_per_tensor.tensor"] - - -@register_node_visitor -class PerChannelDequantizeDefault(DequantizeOpBase): - target = ["quantized_decomposed.dequantize_per_channel.default"] - - -@register_node_visitor -class PerChannelDequantizeTensor(DequantizeOpBase): - target = ["quantized_decomposed.dequantize_per_channel.tensor"] +class PerChannelDequantize(DequantizeOpBase): + target = [ + "quantized_decomposed.dequantize_per_channel.default", + "quantized_decomposed.dequantize_per_channel.tensor", + ] diff --git a/backends/qualcomm/builders/op_linear.py b/backends/qualcomm/builders/op_linear.py index 85f65ebbcbb..9a593528219 100644 --- a/backends/qualcomm/builders/op_linear.py +++ b/backends/qualcomm/builders/op_linear.py @@ -40,6 +40,14 @@ def define_node( linear_input_tensors.append(input_tensor_wrapper) weight_node = node.args[1] + if ( + quant_attrs := weight_node.meta.get("quant_attrs") + ) and "scales" in quant_attrs: + # Dimension of weight is [m, n]; per-channel quant params are [m]. + # Change to [m, 1] to fit the tensor.div(s).add(z) + quant_attrs["scales"] = quant_attrs["scales"].reshape([-1, 1]) + quant_attrs["zero_points"] = quant_attrs["zero_points"].reshape([-1, 1]) + weight_tensor = get_parameter(weight_node, self.edge_program) weight_tensor_wrapper = self.define_tensor( weight_node, @@ -52,6 +60,12 @@ def define_node( if len(node.args) >= 3: bias_node = node.args[2] + + # TODO: remove this when the QNN SDK supports it + if "scales" in bias_node.meta.get("quant_attrs", {}): + print( + f"[WARNING] Fallback linear bias, {bias_node}. Per-channel bias quantization is not supported yet."
+ ) bias_tensor = get_parameter(bias_node, self.edge_program) bias_tensor_wrapper = self.define_tensor( bias_node, diff --git a/backends/qualcomm/builders/op_log_softmax.py b/backends/qualcomm/builders/op_log_softmax.py index c159b9bf00e..002dd5bc9b2 100644 --- a/backends/qualcomm/builders/op_log_softmax.py +++ b/backends/qualcomm/builders/op_log_softmax.py @@ -72,5 +72,4 @@ def define_node( PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, {"data": np.uint32(dim)}, ) - # pdb.set_trace() return log_softmax_op diff --git a/backends/qualcomm/builders/op_sqrt.py b/backends/qualcomm/builders/op_sqrt.py new file mode 100644 index 00000000000..7847d00e8b8 --- /dev/null +++ b/backends/qualcomm/builders/op_sqrt.py @@ -0,0 +1,59 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpSqrt, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class SQRT(NodeVisitor): + target = ["aten.sqrt.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # tensor input + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + sqrt_input_tensors = [input_tensor_wrapper] + + out_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + sqrt_output_tensors = [output_tensor_wrapper] + + sqrt_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpSqrt.op_name, + ) + sqrt_op.AddInputTensors(sqrt_input_tensors) + sqrt_op.AddOutputTensors(sqrt_output_tensors) + + return sqrt_op diff --git a/backends/qualcomm/builders/op_sum_int_list.py b/backends/qualcomm/builders/op_sum_int_list.py new file mode 100644 index 00000000000..26cc262462e --- /dev/null +++ b/backends/qualcomm/builders/op_sum_int_list.py @@ -0,0 +1,80 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+from typing import cast, Dict, List + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpReduceSum, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Sum(NodeVisitor): + target = ["aten.sum.dim_IntList"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + sum_input_tensors = [input_tensor_wrapper] + + # sum dims + sum_dims = cast(List[int], node.args[1]) + sum_dims = [sum_dim % len(input_node.meta["val"].shape) for sum_dim in sum_dims] + if "axis_order" in node.meta: + sum_dims = [node.meta["axis_order"].index(sum_dim) for sum_dim in sum_dims] + sum_dims_shape = [len(sum_dims)] + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + sum_output_tensors = [output_tensor_wrapper] + sum_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReduceSum.op_name, + ) + sum_op.AddInputTensors(sum_input_tensors) + sum_op.AddOutputTensors(sum_output_tensors) + sum_op.AddTensorParam( + OpReduceSum.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(sum_dims_shape), + sum_dims_shape, + np.array(sum_dims, dtype=np.uint32), + True, + ) + + if len(node.args) > 2: + keep_dims = cast(bool, node.args[2]) + sum_op.AddScalarParam( + OpReduceSum.param_keep_dims, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {"data": keep_dims}, + ) + return sum_op diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 82c50046bee..118a0768d9d 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -8,7 +8,6 @@ from enum import IntEnum, unique QNN_OP_PACKAGE_NAME_QTI_AISW = "qti.aisw" -QNN_uint16 = "uint16" # Below constants should be same as those in QNN headers. 
# Maybe someday we should expose these constants by pybind @@ -106,6 +105,13 @@ class OpExpandDims: param_axis: str = "axis" +@dataclass(init=False, frozen=True) +class OpReduceSum: + op_name: str = "ReduceSum" + param_axes: str = "axes" + param_keep_dims: str = "keep_dims" + + @dataclass(init=False, frozen=True) class OpFullyConnected: op_name: str = "FullyConnected" @@ -123,6 +129,11 @@ class OpGelu: op_name: str = "Gelu" +@dataclass(init=False, frozen=True) +class OpSqrt: + op_name: str = "ElementWiseSquareRoot" + + @dataclass(init=False, frozen=True) class OpHardSwish: op_name: str = "HardSwish" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index b06a5766a63..36a2986f09a 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -13,6 +13,8 @@ exir_ops.edge.aten.clone.default, exir_ops.edge.aten.index.Tensor, exir_ops.edge.aten.full.default, + exir_ops.edge.aten.slice_scatter.default, + exir_ops.edge.aten.index_put.default, ] allow_list_operator = [ diff --git a/backends/qualcomm/passes/convert_hardsigmoid.py b/backends/qualcomm/passes/convert_hardsigmoid.py index dc0044da392..68fb8e11094 100644 --- a/backends/qualcomm/passes/convert_hardsigmoid.py +++ b/backends/qualcomm/passes/convert_hardsigmoid.py @@ -25,6 +25,10 @@ def call(self, graph_module: torch.fx.GraphModule): partitions = get_source_partitions(graph, [torch.nn.Hardsigmoid]) for _, src_partitions in partitions.items(): for src_partition in src_partitions: + if exir_ops.edge.aten.hardswish.default in [ + node.target for node in src_partition.nodes + ]: + continue if self.quantization_capture: # only one hardsigmoid op will be seen input_nodes = src_partition.input_nodes @@ -34,8 +38,6 @@ def call(self, graph_module: torch.fx.GraphModule): else: in_ops_target = exir_ops.edge.aten.add.Tensor out_ops_target = exir_ops.edge.aten.div.Tensor - # see the reverse engineering logic hardswish - # https://shorturl.at/pACEL input_nodes = [ n for n in src_partition.nodes if n.target is in_ops_target ] diff --git a/backends/qualcomm/passes/layout_transform.py b/backends/qualcomm/passes/layout_transform.py index 8c86f1919ad..fbf1431f1a5 100644 --- a/backends/qualcomm/passes/layout_transform.py +++ b/backends/qualcomm/passes/layout_transform.py @@ -52,6 +52,9 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.gelu.default, + exir_ops.edge.aten.sqrt.default, + exir_ops.edge.aten.sum.dim_IntList, + exir_ops.edge.aten.pow.Tensor_Scalar, *q_ops, *dq_ops, _operator.getitem, @@ -109,7 +112,10 @@ def is_layout_sensitive(self, node: torch.fx.Node) -> bool: return node.target in self.layout_sensitive_ops def is_layout_agnostic(self, node: torch.fx.Node) -> bool: - if node.target == exir_ops.edge.aten.mean.dim: + if node.target in [ + exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.sum.dim_IntList, + ]: # if dimemsion is not kept, we'll have no clue how to do layout transform if len(node.args) < 3 or not node.args[2]: return False diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 674314d991c..1414af171a4 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -267,7 +267,7 @@ def __init__(self): self.custom_quant_annotations: Sequence[Callable] = [] self.discard_nodes: Set[str] = set() - self.enable_per_channel_conv_quant: bool = True + self.use_per_channel_weight_quant_ops: 
Set[OpOverload] = set() # the weight quantized for activation 8 bits and 16 bits self.per_channel_weight_dtype: Dict = { "8bit_act": torch.int8, @@ -290,16 +290,13 @@ def _annotate_custom_annotation(self, gm: GraphModule) -> None: def _get_quant_config(self, op: str | OpOverload) -> Optional[QuantizationConfig]: """ Priority: - 1. per channel config when enable_per_channel_conv_quant is True + 1. per channel config when op is in use_per_channel_weight_quant_ops 2. int8 / int16 config """ if type(op) == str: return - if self.enable_per_channel_conv_quant and op in [ - torch.ops.aten.conv1d.default, - torch.ops.aten.conv2d.default, - ]: + if op in self.use_per_channel_weight_quant_ops: if op in self.bit16_quant_ops: return get_ptq_per_channel_weight_config( torch.uint16, self.per_channel_weight_dtype["16bit_act"] @@ -316,6 +313,12 @@ def _get_quant_config(self, op: str | OpOverload) -> Optional[QuantizationConfig print(f"No quant config is implemented for op, {op}") + def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): + if enable: + self.use_per_channel_weight_quant_ops.update(ops) + else: + self.use_per_channel_weight_quant_ops.difference_update(ops) + def add_16bit_quant_ops(self, ops: Set[OpOverload]) -> None: for op in ops: assert ( @@ -368,8 +371,15 @@ def set_per_channel_weight_dtype( if weight_dtype_for_16bit_act: self.per_channel_weight_dtype["16bit_act"] = weight_dtype_for_16bit_act - def set_per_channel_quant(self, enable: bool) -> None: - self.enable_per_channel_conv_quant = enable + def set_per_channel_conv_quant(self, enable: bool) -> None: + conv_ops = {torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default} + self._update_per_channel_weight_quant_ops(conv_ops, enable) + + def set_per_channel_linear_quant(self, enable: bool) -> None: + linear_ops = { + torch.ops.aten.linear.default, + } + self._update_per_channel_weight_quant_ops(linear_ops, enable) def transform_for_annotation(self, model: GraphModule) -> GraphModule: model = RemoveClone()(model).graph_module diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index 809b7298eba..ac741b7dc14 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -9,6 +9,7 @@ import torch from torch._ops import OpOverload +from torch._subclasses import FakeTensor from torch.ao.quantization.quantizer import ( QuantizationAnnotation, @@ -42,6 +43,19 @@ def decorator(annotator: Callable): return decorator +def _is_input_float_tensor(node: Node): + """Check if the input is a float tensor; if it is not, quantization is skipped for the node + since observers only work with float Tensors + """ + if ( + not isinstance(node, Node) + or "val" not in node.meta + or not isinstance(node.meta["val"], FakeTensor) + ): + return False + return node.meta["val"].dtype == torch.float32 + + def _is_annotated(nodes: List[Node]): """ Given a list of nodes (that represents an operator pattern), @@ -123,11 +137,11 @@ def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None input_qspec_map = {} input_act0 = node.args[0] - if isinstance(input_act0, Node): + if _is_input_float_tensor(input_act0): input_qspec_map[input_act0] = input_act_qspec input_act1 = node.args[1] - if isinstance(input_act1, Node): + if _is_input_float_tensor(input_act1): input_qspec_map[input_act1] = input_act_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( @@ -162,6 +176,11 @@ def annotate_rsub(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node,
quantization_config) +@register_annotator([torch.ops.aten.sum.dim_IntList]) +def annotate_sum(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator([torch.ops.aten.ceil.default]) def annotate_ceil(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -289,6 +308,11 @@ def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.sqrt.default]) +def annotate_sqrt(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_single_in_single_out(node, quantization_config) + + @register_annotator([torch.ops.aten.gelu.default]) def annotate_gelu(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -520,11 +544,11 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None ) nodes_to_mark_annotated = [node, weight_node] if bias_node: - _annotate_input_qspec_map( - node, - bias_node, - quantization_config.bias, - ) + if callable(quantization_config.bias): + bias_config = quantization_config.bias(node) + else: + bias_config = quantization_config.bias + _annotate_input_qspec_map(node, bias_node, bias_config) nodes_to_mark_annotated.append(bias_node) _annotate_output_qspec(node, quantization_config.output_activation) _mark_nodes_as_annotated(nodes_to_mark_annotated) diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index 18ebf412fc0..b78b481e86e 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -93,7 +93,6 @@ mkdir build_android cd build_android # build executorch & qnn_executorch_backend cmake .. 
\ - -DBUCK2=buck2 \ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index bbc89276854..812380ae6af 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -122,8 +122,8 @@ def __init__( ) -> None: super().__init__() self.modules = [ - Conv2DSequential(), - Conv2DSequential(), + Conv2dSequential(), + Conv2dSequential(), Add(), Relu(), ] @@ -172,7 +172,7 @@ def forward(self, x, y): return CompositeReferenceModule(self.modules) -class Conv1DSequential(torch.nn.Module): +class Conv1dSequential(torch.nn.Module): def __init__(self): super().__init__() self.first = torch.nn.Conv1d( @@ -210,43 +210,6 @@ def forward(self, x): return x -class Conv2DSequential(torch.nn.Module): - def __init__(self): - super().__init__() - self.first = torch.nn.Conv2d( - in_channels=1, - out_channels=3, - kernel_size=(3, 3), - padding=1, - bias=True, - ) - self.second = torch.nn.Conv2d( - in_channels=3, - out_channels=2, - kernel_size=(3, 3), - padding=1, - bias=True, - ) - - def forward(self, x): - return self.second(self.first(x)) - - -class Conv2DSingle(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=1, - out_channels=3, - kernel_size=(3, 3), - padding=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - class Conv2dAvgPool2d(torch.nn.Module): def __init__(self): super().__init__() @@ -321,6 +284,58 @@ def forward(self, x): return self.pool(self.conv(x)) +class Conv2dSequential(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = torch.nn.Conv2d( + in_channels=1, + out_channels=3, + kernel_size=(3, 3), + padding=1, + bias=True, + ) + self.second = torch.nn.Conv2d( + in_channels=3, + out_channels=2, + kernel_size=(3, 3), + padding=1, + bias=True, + ) + + def forward(self, x): + return self.second(self.first(x)) + + +class Conv2dSingle(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=1, + out_channels=3, + kernel_size=(3, 3), + padding=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + +class Conv2dSumReduceDim(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = torch.nn.Conv2d( + in_channels=1, + out_channels=3, + kernel_size=(3, 3), + padding=1, + bias=True, + ) + + def forward(self, x): + return torch.sum(self.first(x), dim=(2, 3), keepdim=False) + + class Div(torch.nn.Module): def __init__(self): super().__init__() @@ -409,9 +424,9 @@ def forward(self, x): class Linear(torch.nn.Module): - def __init__(self): + def __init__(self, use_bias: bool = True): super().__init__() - self.linear = torch.nn.Linear(4, 5).eval() + self.linear = torch.nn.Linear(4, 5, use_bias).eval() def forward(self, x): return self.linear(x) @@ -691,7 +706,7 @@ def __init__(self): super().__init__() def forward(self, x): - return x / torch.sqrt(torch.tensor([64])) + return x / torch.sqrt(torch.tensor([64.0])) class Squeeze(torch.nn.Module): @@ -748,6 +763,14 @@ def forward(self, x): return 10 - x +class SumIntList(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sum(x, dim=(2, 3), keepdim=True) + + class Tanh(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index fc879431307..3874da9e981 100644 --- 
a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6,8 +6,10 @@ import json import subprocess import sys +import tempfile import unittest from multiprocessing.connection import Listener +from pathlib import Path import torch from executorch.backends.qualcomm.tests.utils import ( @@ -93,12 +95,12 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1DSequential() # noqa: F405 + module = Conv1dSequential() # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2DSequential() # noqa: F405 + module = Conv2dSequential() # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) self.lower_module_and_test_output(module, sample_input) @@ -181,11 +183,10 @@ def test_qnn_backend_element_wise_mul(self): self.lower_module_and_test_output(module, sample_input) index += 1 - @unittest.skip("not yet implemented") def test_qnn_backend_element_wise_sqrt(self): modules = [Sqrt(), SqrtConstant()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) for i, module in enumerate(modules): + sample_input = (torch.rand([3, 1]),) with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) @@ -355,6 +356,11 @@ def test_qnn_backend_squeeze(self): sample_input = (torch.randn([1, 3, 3]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_sum_int_list(self): + module = SumIntList() # noqa: F405 + sample_input = (torch.randn([1, 4, 8, 8]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -419,6 +425,11 @@ def test_qnn_backend_conv2d_max_pool2d(self): sample_input = (torch.rand(1, 2, 14, 14),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_sum_reduce_dim(self): + module = Conv2dSumReduceDim() # noqa: F405 + sample_input = (torch.randn([1, 1, 3, 3]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_residual_block(self): module = ResidualBlockModule() # noqa: F405 sample_input = (torch.randn(1, 32, 28, 28),) @@ -492,7 +503,7 @@ def setUp(self): ) def test_qnn_backend_16a4w_conv2d(self): - module = Conv2DSingle() # noqa: F405 + module = Conv2dSingle() # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) module = self.get_qdq_module( module, sample_input, quant_dtype=QuantDtype.use_16a4w @@ -503,7 +514,33 @@ def test_qnn_backend_16a4w_linear(self): module = Linear() # noqa: F405 sample_input = (torch.randn([3, 4]),) module = self.get_qdq_module( - module, sample_input, quant_dtype=QuantDtype.use_16a4w + module, + sample_input, + quant_dtype=QuantDtype.use_16a4w, + ) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_16a4w_per_channel_linear(self): + module = Linear(use_bias=False) # noqa: F405 + sample_input = (torch.randn([3, 4]),) + module = self.get_qdq_module( + module, + sample_input, + is_linear_per_channel=True, + quant_dtype=QuantDtype.use_16a4w, + ) + self.lower_module_and_test_output(module, sample_input) + + # Is not enabled in the current qnn sdk release + @unittest.expectedFailure + def test_qnn_backend_16a4w_per_channel_linear_with_bias(self): + module = Linear() # noqa: F405 + sample_input = (torch.randn([3, 4]),) + module = self.get_qdq_module( + module, + sample_input, + is_linear_per_channel=True, 
+ quant_dtype=QuantDtype.use_16a4w, ) self.lower_module_and_test_output(module, sample_input) @@ -547,13 +584,13 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1DSequential() # noqa: F405 + module = Conv1dSequential() # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2DSequential() # noqa: F405 + module = Conv2dSequential() # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -641,11 +678,10 @@ def test_qnn_backend_element_wise_mul(self): self.lower_module_and_test_output(module, sample_input) index += 1 - @unittest.skip("not yet implemented") def test_qnn_backend_element_wise_sqrt(self): modules = [Sqrt(), SqrtConstant()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) for i, module in enumerate(modules): + sample_input = (torch.rand([3, 1]),) with self.subTest(i=i): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -845,6 +881,12 @@ def test_qnn_backend_stack(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_sum_int_list(self): + module = SumIntList() # noqa: F405 + sample_input = (torch.randn([1, 4, 8, 8]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -918,6 +960,12 @@ def test_qnn_backend_conv2d_max_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_sum_reduce_dim(self): + module = Conv2dSumReduceDim() # noqa: F405 + sample_input = (torch.randn([1, 1, 3, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_example_models(self): instances = [ {"module": DeepLabV3ResNet101Model(), "annotation": ()}, @@ -1067,6 +1115,7 @@ def test_qnn_backend_multi_contexts_composite(self): exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + @unittest.expectedFailure def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=True) @@ -1102,6 +1151,19 @@ def test_qnn_backend_shared_buffer(self): expected_partitions=1, ) + def test_qnn_backend_online_prepare(self): + backend_options = generate_htp_compiler_spec(use_fp16=True) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + debug=False, + saver=False, + online_prepare=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + self.lower_module_and_test_output(module, sample_input) + class TestQNNQuantizedUtils(TestQNN): # TODO: refactor to support different backends @@ -1186,6 +1248,7 @@ def test_qnn_backend_multi_contexts_composite(self): exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + @unittest.expectedFailure def 
test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=False) @@ -1223,6 +1286,20 @@ def test_qnn_backend_shared_buffer(self): expected_partitions=1, ) + def test_qnn_backend_online_prepare(self): + backend_options = generate_htp_compiler_spec(use_fp16=False) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + debug=False, + saver=False, + online_prepare=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + class TestExampleOssScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -1268,6 +1345,40 @@ def test_fbnet(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 90) + def test_ssd300_vgg16(self): + if not self.required_envs([self.pretrained_weight, self.oss_repo]): + self.skipTest("missing required envs") + + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/ssd300_vgg16.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--oss_repo", + self.oss_repo, + "--pretrained_weight", + self.pretrained_weight, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + self.assertGreaterEqual(msg["mAP"], 0.70) + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -1640,6 +1751,29 @@ def test_ptq_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 5) + def test_export_example(self): + if not self.required_envs([self.model_name]): + self.skipTest("missing required envs") + + with tempfile.TemporaryDirectory() as tmp_dir: + cmds = [ + "python", + "qualcomm/scripts/export_example.py", + "--model_name", + self.model_name, + "--output_folder", + "{}/".format(tmp_dir), + "--generate_etrecord", + ] + + p = subprocess.Popen( + cmds, stdout=subprocess.DEVNULL, cwd=f"{self.executorch_root}/examples" + ) + p.communicate() + self.assertTrue( + Path("{0}/{1}.pte".format(tmp_dir, self.model_name)).exists() + ) + def setup_environment(): parser = setup_common_args_and_variables() @@ -1669,6 +1803,12 @@ def setup_environment(): default="", type=str, ) + parser.add_argument( + "-n", + "--model_name", + help="Input the model to export", + type=str, + ) parser.add_argument( "-o", "--online_prepare", @@ -1687,6 +1827,11 @@ def setup_environment(): help="Emit log only when error happened", action="store_true", ) + parser.add_argument( + "--oss_repo", + help="Path to open source software model repository", + type=str, + ) args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host @@ -1697,9 +1842,11 @@ def setup_environment(): TestQNN.artifact_dir = args.artifact_dir TestQNN.image_dataset = args.image_dataset TestQNN.pretrained_weight = args.pretrained_weight + TestQNN.model_name = args.model_name TestQNN.online_prepare = args.online_prepare TestQNN.enable_profile = args.enable_profile TestQNN.error_only = args.error_only + TestQNN.oss_repo = args.oss_repo TestQNN.shared_buffer = 
args.shared_buffer return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 5700b5fb17a..59a48f123da 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -225,6 +225,7 @@ def get_qdq_module( module: torch.nn.Module, inputs: Tuple[torch.Tensor], is_conv_per_channel: Optional[bool] = True, + is_linear_per_channel: Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), quant_dtype: QuantDtype = QuantDtype.use_8a8w, ) -> torch.fx.GraphModule: @@ -232,7 +233,8 @@ def get_qdq_module( quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_quant_annotations) - quantizer.set_per_channel_quant(is_conv_per_channel) + quantizer.set_per_channel_conv_quant(is_conv_per_channel) + quantizer.set_per_channel_linear_quant(is_linear_per_channel) if quant_dtype == QuantDtype.use_8a8w: pass # default setting diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index b6792b5d70b..0a9b7d064d1 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -19,6 +19,7 @@ ConvertBinaryOpsWithScalar, ) from executorch.backends.qualcomm.passes.convert_bmm_to_matmul import ConvertBmmToMatmul +from executorch.backends.qualcomm.passes.convert_hardsigmoid import ConvertHardsigmoid from executorch.backends.qualcomm.passes.convert_interpolate_with_upsample2d import ( ConvertInterpolateWithUpsample2D, ) @@ -103,6 +104,7 @@ def _transform(edge_program: ExportedProgram) -> None: graph_module = edge_program.graph_module RemoveClone()(graph_module) ConvertToLinear()(graph_module) + ConvertHardsigmoid()(graph_module) ConvertBmmToMatmul()(graph_module) ConvertInterpolateWithUpsample2D()(graph_module) I64toI32(edge_program)(graph_module) diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md new file mode 100644 index 00000000000..bc5a674970f --- /dev/null +++ b/backends/vulkan/README.md @@ -0,0 +1,192 @@ +# ExecuTorch Vulkan Delegate + +The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is +built on top of the cross-platform Vulkan GPU API standard. It is primarily +designed to leverage the GPU to accelerate model inference on Android devices, +but can be used on any platform that supports an implementation of Vulkan: +laptops, servers, and edge devices. + +::::{note} +The Vulkan delegate is currently under active development, and its components +are subject to change. +:::: + +## What is Vulkan? + +Vulkan is a low-level GPU API specification developed as a successor to OpenGL. +It is designed to offer developers more explicit control over GPUs compared to +previous specifications in order to reduce overhead and maximize the +capabilities of the modern graphics hardware. + +Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both +desktop and mobile) in the market support Vulkan. Vulkan is also included in +Android from Android 7.0 onwards. + +**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it +provides a way to execute compute and graphics operations on a GPU, but does not +come with a built-in library of performant compute kernels. + +## The Vulkan Compute Library + +The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as +the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to +provide GPU implementations for PyTorch operators via GLSL compute shaders. 
+ +The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). +The core components of the PyTorch Vulkan backend were forked into ExecuTorch +and adapted for an AOT graph-mode style of model inference (as opposed to +PyTorch which adopted an eager execution style of model inference). + +The components of the Vulkan Compute Library are contained in the +`executorch/backends/vulkan/runtime/` directory. The core components are listed +and described below: + +``` +runtime/ +├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects +└── graph/ .................. ComputeGraph class which implements graph mode inference + └── ops/ ................ Base directory for operator implementations + ├── glsl/ ........... GLSL compute shaders + │ ├── *.glsl + │ └── conv2d.glsl + └── impl/ ........... C++ code to dispatch GPU compute shaders + ├── *.cpp + └── Conv2d.cpp +``` + +## Features + +The Vulkan delegate currently supports the following features: + +* **Memory Planning** + * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. +* **Capability Based Partitioning**: + * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs +* **Support for upper-bound dynamic shapes**: + * Tensors can change shape between inferences as long as its current shape is smaller than the bounds specified during lowering + +In addition to increasing operator coverage, the following features are +currently in development: + +* **Quantization Support** + * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. +* **Memory Layout Management** + * Memory layout is an important factor to optimizing performance. We plan to introduce graph passes to introduce memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. +* **Selective Build** + * We plan to make it possible to control build size by selecting which operators/shaders you want to build with + +## End to End Example + +To further understand the features of the Vulkan Delegate and how to use it, +consider the following end to end example with MobileNet V2. + +### Compile and lower a model to the Vulkan Delegate + +Assuming ExecuTorch has been set up and installed, the following script can be +used to produce a lowered MobileNet V2 model as `vulkan_mobilenetv2.pte`. 
+
+```
+import torch
+import torchvision.models as models
+
+from torch.export import export, ExportedProgram
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge
+from executorch.exir.backend.backend_api import to_backend
+
+mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
+sample_inputs = (torch.randn(1, 3, 224, 224), )
+
+exported_program: ExportedProgram = export(mobilenet_v2, sample_inputs)
+edge: EdgeProgramManager = to_edge(exported_program)
+
+# Lower the model to Vulkan backend
+edge = edge.to_backend(VulkanPartitioner())
+
+exec_prog = edge.to_executorch()
+
+with open("vulkan_mobilenetv2.pte", "wb") as file:
+    exec_prog.write_to_file(file)
+```
+
+Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate
+using the `to_backend()` API. The Vulkan Delegate implements the
+`VulkanPartitioner` class, which identifies nodes (i.e. operators) in the graph
+that are supported by the Vulkan delegate, and separates compatible sections of
+the model to be executed on the GPU.
+
+This means that a model can be lowered to the Vulkan delegate even if it contains
+some unsupported operators. In that case, only the supported parts of the graph
+will be executed on the GPU.
+
+
+::::{note}
+The [Vulkan partitioner code](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/vulkan_partitioner.py)
+can be inspected to examine which ops are currently implemented in the Vulkan
+delegate.
+::::
+
+### Build Vulkan Delegate libraries
+
+The easiest way to build and test the Vulkan Delegate is to build for Android
+and test on a local Android device. Android devices have built-in support for
+Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to
+compile the Vulkan Compute Library's GLSL compute shaders.
+
+The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON`
+when building with CMake.
+
+First, make sure that you have the Android NDK installed - Android NDK r25c is
+recommended. The Android SDK should also be installed so that you have access
+to `adb`.
+
+```shell
+# Recommended version is Android NDK r25c.
+export ANDROID_NDK=
+# Select an appropriate Android ABI
+export ANDROID_ABI=arm64-v8a
+# All subsequent commands should be performed from ExecuTorch repo root
+cd 
+# Make sure adb works
+adb --version
+```
+
+To build and install ExecuTorch libraries (for Android) with the Vulkan
+Delegate:
+
+```shell
+# From executorch root directory
+(rm -rf cmake-android-out && \
+  cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
+    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI=$ANDROID_ABI \
+    -DEXECUTORCH_BUILD_VULKAN=ON \
+    -DPYTHON_EXECUTABLE=python \
+    -Bcmake-android-out && \
+  cmake --build cmake-android-out -j16 --target install)
+```
+
+### Run the Vulkan model on device
+
+::::{note}
+Since operator support is currently limited, only binary arithmetic operators
+will run on the GPU. Expect inference to be slow as the majority of operators
+are being executed via Portable operators.
+::::
+
+Now, the partially delegated model can be executed on your device's
+GPU!
+
+```shell
+# Build a model runner binary linked with the Vulkan delegate libs
+cmake --build cmake-android-out --target vulkan_executor_runner -j32
+
+# Push model to device
+adb push vulkan_mobilenetv2.pte /data/local/tmp/vulkan_mobilenetv2.pte
+# Push binary to device
+adb push cmake-android-out/backends/vulkan/vulkan_executor_runner /data/local/tmp/runner_bin
+
+# Run the model
+adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vulkan_mobilenetv2.pte
+```
diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md
new file mode 100644
index 00000000000..f9fc35657a6
--- /dev/null
+++ b/backends/vulkan/docs/android_demo.md
@@ -0,0 +1,148 @@
+# Building and Running ExecuTorch with the Vulkan Backend
+
+The [ExecuTorch Vulkan Delegate](./native-delegates-executorch-vulkan-delegate.md)
+is a native GPU delegate for ExecuTorch.
+
+
+::::{grid} 2
+:::{grid-item-card} What you will learn in this tutorial:
+:class-card: card-content
+* How to export the Stories 110M parameter model with partial GPU delegation
+* How to execute the partially delegated model on Android
+:::
+:::{grid-item-card} Prerequisites:
+:class-card: card-prerequisites
+* Follow [**Setting up ExecuTorch**](./getting-started-setup.md)
+* Follow [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
+:::
+::::
+
+## Prerequisites
+
+Note that all the steps below should be performed from the ExecuTorch repository
+root directory, and assume that you have gone through the steps of setting up
+ExecuTorch.
+
+You should also refer to the **Prerequisites** section of the [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
+tutorial in order to install the specified versions of the Android NDK and the
+Android SDK.
+
+```shell
+# Recommended version is Android NDK r25c.
+export ANDROID_NDK=
+# Select an appropriate Android ABI
+export ANDROID_ABI=arm64-v8a
+# All subsequent commands should be performed from ExecuTorch repo root
+cd 
+# Make sure adb works
+adb --version
+```
+
+## Lowering the Stories 110M model to Vulkan
+
+::::{note}
+The resultant model will only be partially delegated to the Vulkan backend. In
+particular, only binary arithmetic operators (`aten.add`, `aten.sub`,
+`aten.mul`, `aten.div`) and the matrix multiplication operator (`aten.mm`) will
+be executed on the GPU via the Vulkan delegate. The rest of the model will be
+executed using Portable operators. This is because the Vulkan delegate is still
+early in development and currently has limited operator coverage.
+::::
+
+First, download `stories110M.pt` and `tokenizer.model` from GitHub:
+
+```shell
+wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
+wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
+```
+
+Next, create the params file:
+
+```shell
+echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+```
+
+Then, create a tokenizer binary file:
+
+```shell
+python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+```
+
+Finally, export the `stories110M.pt` file into an ExecuTorch program:
+
+```shell
+python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json --vulkan
+```
+
+A `vulkan_llama2.pte` file should have been created as a result of the last step.
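+
+Before moving on to the on-device steps, it can be useful to confirm that the
+Vulkan partitioner works in your Python environment. The snippet below is a
+minimal sketch that reuses the same APIs as the MobileNet V2 example in the
+Vulkan delegate README (`export`, `to_edge`, `to_backend(VulkanPartitioner())`,
+`to_executorch`); the toy module and output filename are illustrative only.
+
+```python
+import torch
+from torch.export import export
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge
+
+
+class TinyAddMul(torch.nn.Module):
+    """Toy module built from binary arithmetic ops, which the Vulkan delegate supports."""
+
+    def forward(self, x, y):
+        return x * y + y
+
+
+model = TinyAddMul().eval()
+sample_inputs = (torch.randn(1, 16), torch.randn(1, 16))
+
+# Same flow as the MobileNet V2 example: export -> to_edge -> to_backend -> to_executorch
+edge = to_edge(export(model, sample_inputs))
+edge = edge.to_backend(VulkanPartitioner())
+
+# Inspect the lowered graph to see which portions were claimed by the delegate
+print(edge.exported_program().graph_module.graph)
+
+exec_prog = edge.to_executorch()
+with open("vulkan_tiny_add_mul.pte", "wb") as f:
+    exec_prog.write_to_file(f)
+```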
+
+Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device:
+
+```shell
+adb shell mkdir -p /data/local/tmp/llama/
+adb push tokenizer.bin /data/local/tmp/llama/
+adb push vulkan_llama2.pte /data/local/tmp/llama/
+```
+
+## Build and Run the LLaMA runner binary on Android
+
+First, build and install ExecuTorch libraries, then build the LLaMA runner
+binary using the Android NDK toolchain.
+
+```shell
+(rm -rf cmake-android-out && \
+  cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
+    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI=$ANDROID_ABI \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_VULKAN=ON \
+    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DPYTHON_EXECUTABLE=python \
+    -Bcmake-android-out && \
+  cmake --build cmake-android-out -j16 --target install)
+
+# Build LLaMA Runner library
+(rm -rf cmake-android-out/examples/models/llama2 && \
+  cmake examples/models/llama2 \
+    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI=$ANDROID_ABI \
+    -DCMAKE_INSTALL_PREFIX=cmake-android-out \
+    -DPYTHON_EXECUTABLE=python \
+    -Bcmake-android-out/examples/models/llama2 && \
+  cmake --build cmake-android-out/examples/models/llama2 -j16)
+```
+
+Finally, push and run the LLaMA runner binary on your Android device.
+
+```shell
+adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main
+
+adb shell /data/local/tmp/llama_main \
+    --model_path=/data/local/tmp/llama/vulkan_llama2.pte \
+    --tokenizer_path=/data/local/tmp/llama/tokenizer.bin \
+    --prompt "hi" --temperature=0
+```
+
+The following output will be produced:
+
+```
+hippo named Hippy lived in a big pond. Hippy was a very happy hippo. He liked to play...
+```
+
+## Running with the LLaMA Android Demo App
+
+It is also possible to run the partially delegated Vulkan model inside the LLaMA
+Android demo app.
+
+First, make some modifications to the Android app setup script to make sure that
+the Vulkan backend is built when building and installing ExecuTorch libraries:
+
+```shell
+# Run from executorch root directory. You can also edit this in a code editor
+sed -i 's/-DEXECUTORCH_BUILD_XNNPACK=ON/-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_VULKAN=ON/g' examples/demo-apps/android/LlamaDemo/setup.sh
+```
+
+Then, follow the instructions at [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
+to build and run the demo application on your Android device. Once the app
+starts up, you can load and run the `vulkan_llama2.pte` model with the app.
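+
+Whether you use the runner binary or the demo app, how much of the model
+actually runs on the GPU is determined by the `VulkanPartitioner`'s
+supported-operator list. As a rough, illustrative way to gauge delegation
+coverage for a lowered program, you can tally the call targets left in the edge
+graph. This sketch assumes that delegated subgraphs are invoked via the
+`executorch_call_delegate` higher-order op, which is how ExecuTorch represents
+them at the time of writing; treat it as a starting point rather than a
+supported API.
+
+```python
+from collections import Counter
+
+import torch
+from torch.export import export
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge
+
+
+class AddThenAbs(torch.nn.Module):
+    # aten.add can be delegated to Vulkan; aten.abs is expected to stay on the CPU.
+    def forward(self, x, y):
+        return torch.abs(x + y)
+
+
+edge = to_edge(export(AddThenAbs().eval(), (torch.randn(4), torch.randn(4))))
+edge = edge.to_backend(VulkanPartitioner())
+
+# Tally the call_function targets remaining in the lowered edge graph. Delegated
+# subgraphs are invoked via the executorch_call_delegate higher-order op; aten
+# targets listed here will fall back to CPU (Portable) kernels at runtime.
+graph = edge.exported_program().graph_module.graph
+targets = Counter(
+    str(node.target) for node in graph.nodes if node.op == "call_function"
+)
+print(targets)
+```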
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index b5df34b08cd..e9ec9f2d84c 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -48,8 +48,17 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.max_pool2d_with_indices.default, # Sum exir_ops.edge.aten.sum.dim_IntList, + # Convolution operators + exir_ops.edge.aten.convolution.default, + # Normalization + exir_ops.edge.aten.native_layer_norm.default, + # Shape-related operators + exir_ops.edge.aten.select_copy.int, + exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.view_copy.default, # Other operator.getitem, + exir_ops.edge.aten.full.default, ] return supported diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 03579b888dd..c1f3f06b440 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -77,26 +77,26 @@ api::StorageType get_storage_type( const vkgraph::VkStorageType& vk_storage_type) { switch (vk_storage_type) { case vkgraph::VkStorageType::BUFFER: - return api::StorageType::BUFFER; + return api::kBuffer; case vkgraph::VkStorageType::TEXTURE_3D: - return api::StorageType::TEXTURE_3D; + return api::kTexture3D; case vkgraph::VkStorageType::TEXTURE_2D: - return api::StorageType::TEXTURE_2D; + return api::kTexture2D; default: break; } - return api::StorageType::UNKNOWN; + VK_THROW("Invalid storage type encountered!"); } api::GPUMemoryLayout get_memory_layout( const vkgraph::VkMemoryLayout& vk_memory_layout) { switch (vk_memory_layout) { case vkgraph::VkMemoryLayout::TENSOR_WIDTH_PACKED: - return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED; + return api::kWidthPacked; case vkgraph::VkMemoryLayout::TENSOR_HEIGHT_PACKED: - return api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED; + return api::kHeightPacked; case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED: - return api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + return api::kChannelsPacked; default: break; } @@ -334,18 +334,18 @@ bool maybe_resize_input( const size_t input_i, exec_aten::Tensor& et_tensor) { ValueRef in_tensor_ref = graph->inputs()[input_i].value; - vTensor& in_tensor = graph->get_val(in_tensor_ref).toTensor(); + vTensorPtr in_tensor = graph->get_tensor(in_tensor_ref); ET_CHECK_MSG( - et_tensor.dim() == in_tensor.sizes().size(), + et_tensor.dim() == in_tensor->sizes().size(), "Cannot resize input tensor: old ndim %zu does not match new ndim %zu", - static_cast(in_tensor.sizes().size()), + static_cast(in_tensor->sizes().size()), static_cast(et_tensor.dim())); bool should_resize = false; std::vector new_sizes(et_tensor.dim()); for (size_t i = 0; i < et_tensor.dim(); i++) { - if (in_tensor.sizes()[i] != et_tensor.sizes()[i]) { + if (in_tensor->sizes()[i] != et_tensor.sizes()[i]) { should_resize = true; } new_sizes.at(i) = et_tensor.sizes()[i]; @@ -356,9 +356,9 @@ bool maybe_resize_input( } ET_CHECK_MSG( - in_tensor.numel() == et_tensor.numel(), + in_tensor->numel() == et_tensor.numel(), "Vulkan tensor numel %zu does not match ET tensor numel %zu", - static_cast(in_tensor.numel()), + static_cast(in_tensor->numel()), static_cast(et_tensor.numel())); return should_resize; @@ -369,12 +369,12 @@ void maybe_resize_output( const size_t output_i, exec_aten::Tensor& et_tensor) { ValueRef out_tensor_ref = graph->outputs()[output_i].value; - vTensor& out_tensor = 
graph->get_val(out_tensor_ref).toTensor(); + vTensorPtr out_tensor = graph->get_tensor(out_tensor_ref); exec_aten::SizesType new_output_size[kTensorDimensionLimit]; - size_t ndim = out_tensor.sizes().size(); + size_t ndim = out_tensor->sizes().size(); for (int i = 0; i < ndim; ++i) { - new_output_size[i] = out_tensor.sizes()[i]; + new_output_size[i] = out_tensor->sizes()[i]; } exec_aten::ArrayRef output_size{new_output_size, ndim}; @@ -449,6 +449,9 @@ class VulkanBackend final : public PyTorchBackendInterface { Error err = compileModel(processed->data(), compute_graph); + // This backend does not need its processed data after compiling the model. + processed->Free(); + if (err != Error::Ok) { return err; } diff --git a/backends/vulkan/runtime/api/Adapter.cpp b/backends/vulkan/runtime/api/Adapter.cpp index 3d5c87d23d9..a02a6aa3e0a 100644 --- a/backends/vulkan/runtime/api/Adapter.cpp +++ b/backends/vulkan/runtime/api/Adapter.cpp @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +// @lint-ignore-every CLANGTIDY clang-diagnostic-missing-field-initializers + #include #include @@ -21,15 +23,33 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) : handle(physical_device_handle), properties{}, memory_properties{}, + shader_16bit_storage{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES}, + shader_8bit_storage{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES}, + shader_float16_int8_types{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR}, queue_families{}, num_compute_queues(0), has_unified_memory(false), has_timestamps(properties.limits.timestampComputeAndGraphics), - timestamp_period(properties.limits.timestampPeriod) { + timestamp_period(properties.limits.timestampPeriod), + extension_features(&shader_16bit_storage) { // Extract physical device properties vkGetPhysicalDeviceProperties(handle, &properties); vkGetPhysicalDeviceMemoryProperties(handle, &memory_properties); + VkPhysicalDeviceFeatures2 features2{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2}; + + // Create linked list to query availability of extensions + features2.pNext = &shader_16bit_storage; + shader_16bit_storage.pNext = &shader_8bit_storage; + shader_8bit_storage.pNext = &shader_float16_int8_types; + shader_float16_int8_types.pNext = nullptr; + + vkGetPhysicalDeviceFeatures2(handle, &features2); + // Check if there are any memory types have both the HOST_VISIBLE and the // DEVICE_LOCAL property flags const VkMemoryPropertyFlags unified_memory_flags = @@ -140,6 +160,9 @@ VkDevice create_logical_device( #ifdef VK_KHR_portability_subset VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, #endif /* VK_KHR_portability_subset */ + VK_KHR_16BIT_STORAGE_EXTENSION_NAME, + VK_KHR_8BIT_STORAGE_EXTENSION_NAME, + VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, }; std::vector enabled_device_extensions; @@ -148,7 +171,7 @@ VkDevice create_logical_device( enabled_device_extensions, requested_device_extensions); - const VkDeviceCreateInfo device_create_info{ + VkDeviceCreateInfo device_create_info{ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType nullptr, // pNext 0u, // flags @@ -162,6 +185,8 @@ VkDevice create_logical_device( nullptr, // pEnabledFeatures }; + device_create_info.pNext = physical_device.extension_features; + VkDevice handle = nullptr; VK_CHECK(vkCreateDevice( physical_device.handle, &device_create_info, nullptr, &handle)); @@ -371,38 +396,57 @@ std::string Adapter::stringize() const { ss << " deviceType: " << device_type << std::endl; ss << " deviceName: 
" << properties.deviceName << std::endl; -#define PRINT_LIMIT_PROP(name) \ - ss << " " << std::left << std::setw(36) << #name << limits.name \ +#define PRINT_PROP(struct, name) \ + ss << " " << std::left << std::setw(36) << #name << struct.name \ << std::endl; -#define PRINT_LIMIT_PROP_VEC3(name) \ - ss << " " << std::left << std::setw(36) << #name << limits.name[0] \ - << "," << limits.name[1] << "," << limits.name[2] << std::endl; +#define PRINT_PROP_VEC3(struct, name) \ + ss << " " << std::left << std::setw(36) << #name << struct.name[0] \ + << "," << struct.name[1] << "," << struct.name[2] << std::endl; ss << " Physical Device Limits {" << std::endl; - PRINT_LIMIT_PROP(maxImageDimension1D); - PRINT_LIMIT_PROP(maxImageDimension2D); - PRINT_LIMIT_PROP(maxImageDimension3D); - PRINT_LIMIT_PROP(maxTexelBufferElements); - PRINT_LIMIT_PROP(maxPushConstantsSize); - PRINT_LIMIT_PROP(maxMemoryAllocationCount); - PRINT_LIMIT_PROP(maxSamplerAllocationCount); - PRINT_LIMIT_PROP(maxComputeSharedMemorySize); - PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupCount); - PRINT_LIMIT_PROP(maxComputeWorkGroupInvocations); - PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupSize); + PRINT_PROP(limits, maxImageDimension1D); + PRINT_PROP(limits, maxImageDimension2D); + PRINT_PROP(limits, maxImageDimension3D); + PRINT_PROP(limits, maxTexelBufferElements); + PRINT_PROP(limits, maxPushConstantsSize); + PRINT_PROP(limits, maxMemoryAllocationCount); + PRINT_PROP(limits, maxSamplerAllocationCount); + PRINT_PROP(limits, maxComputeSharedMemorySize); + PRINT_PROP_VEC3(limits, maxComputeWorkGroupCount); + PRINT_PROP(limits, maxComputeWorkGroupInvocations); + PRINT_PROP_VEC3(limits, maxComputeWorkGroupSize); + ss << " }" << std::endl; + + ss << " 16bit Storage Features {" << std::endl; + PRINT_PROP(physical_device_.shader_16bit_storage, storageBuffer16BitAccess); + PRINT_PROP( + physical_device_.shader_16bit_storage, + uniformAndStorageBuffer16BitAccess); + PRINT_PROP(physical_device_.shader_16bit_storage, storagePushConstant16); + PRINT_PROP(physical_device_.shader_16bit_storage, storageInputOutput16); + ss << " }" << std::endl; + + ss << " 8bit Storage Features {" << std::endl; + PRINT_PROP(physical_device_.shader_8bit_storage, storageBuffer8BitAccess); + PRINT_PROP( + physical_device_.shader_8bit_storage, uniformAndStorageBuffer8BitAccess); + PRINT_PROP(physical_device_.shader_8bit_storage, storagePushConstant8); + ss << " }" << std::endl; + + ss << " Shader 16bit and 8bit Features {" << std::endl; + PRINT_PROP(physical_device_.shader_float16_int8_types, shaderFloat16); + PRINT_PROP(physical_device_.shader_float16_int8_types, shaderInt8); ss << " }" << std::endl; - ss << " }" << std::endl; - ; const VkPhysicalDeviceMemoryProperties& mem_props = physical_device_.memory_properties; + ss << " }" << std::endl; ss << " Memory Info {" << std::endl; ss << " Memory Types [" << std::endl; for (size_t i = 0; i < mem_props.memoryTypeCount; ++i) { - ss << " " - << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " + ss << " " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags) << std::endl; } @@ -433,6 +477,9 @@ std::string Adapter::stringize() const { ss << " ]" << std::endl; ss << "}"; +#undef PRINT_PROP +#undef PRINT_PROP_VEC3 + return ss.str(); } diff --git a/backends/vulkan/runtime/api/Adapter.h b/backends/vulkan/runtime/api/Adapter.h index afbb48f4059..b038aea9fa8 100644 --- a/backends/vulkan/runtime/api/Adapter.h +++ b/backends/vulkan/runtime/api/Adapter.h 
@@ -30,6 +30,12 @@ struct PhysicalDevice final { // Properties obtained from Vulkan VkPhysicalDeviceProperties properties; VkPhysicalDeviceMemoryProperties memory_properties; + // Additional features available from extensions + VkPhysicalDevice16BitStorageFeatures shader_16bit_storage; + VkPhysicalDevice8BitStorageFeatures shader_8bit_storage; + VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types; + + // Available GPU queues std::vector queue_families; // Metadata @@ -38,6 +44,9 @@ struct PhysicalDevice final { bool has_timestamps; float timestamp_period; + // Head of the linked list of extensions to be requested + void* extension_features{nullptr}; + explicit PhysicalDevice(VkPhysicalDevice); }; @@ -189,6 +198,34 @@ class Adapter final { return vma_; } + // Physical Device Features + + inline bool has_16bit_storage() { + return physical_device_.shader_16bit_storage.storageBuffer16BitAccess == + VK_TRUE; + } + + inline bool has_8bit_storage() { + return physical_device_.shader_8bit_storage.storageBuffer8BitAccess == + VK_TRUE; + } + + inline bool has_16bit_compute() { + return physical_device_.shader_float16_int8_types.shaderFloat16 == VK_TRUE; + } + + inline bool has_8bit_compute() { + return physical_device_.shader_float16_int8_types.shaderInt8 == VK_TRUE; + } + + inline bool has_full_float16_buffers_support() { + return has_16bit_storage() && has_16bit_compute(); + } + + inline bool has_full_int8_buffers_support() { + return has_8bit_storage() && has_8bit_compute(); + } + // Command Buffer Submission void diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 5f2d2eb72c7..9a43cf455d6 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -59,17 +59,25 @@ Context::~Context() { DescriptorSet Context::get_descriptor_set( const ShaderInfo& shader_descriptor, - const utils::uvec3& local_workgroup_size) { + const utils::uvec3& local_workgroup_size, + const SpecVarList& additional_constants) { VkDescriptorSetLayout shader_layout = shader_layout_cache().retrieve(shader_descriptor.kernel_layout); VkPipelineLayout pipeline_layout = pipeline_layout_cache().retrieve(shader_layout); + SpecVarList spec_constants = { + SV(local_workgroup_size.data[0u]), + SV(local_workgroup_size.data[1u]), + SV(local_workgroup_size.data[2u])}; + + spec_constants.append(additional_constants); + VkPipeline pipeline = pipeline_cache().retrieve( {pipeline_layout_cache().retrieve(shader_layout), shader_cache().retrieve(shader_descriptor), - local_workgroup_size}); + spec_constants}); cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); @@ -227,5 +235,11 @@ UniformParamsBuffer& UniformParamsBuffer::operator=( return *this; } +ParamsBindList::ParamsBindList( + std::initializer_list init_list) { + bind_infos.resize(init_list.size()); + std::copy(init_list.begin(), init_list.end(), bind_infos.begin()); +} + } // namespace api } // namespace vkcompute diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index 0813d4190de..d79344dce8d 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -172,7 +172,16 @@ class Context final { } } - DescriptorSet get_descriptor_set(const ShaderInfo&, const utils::uvec3&); + DescriptorSet get_descriptor_set( + const ShaderInfo&, + const utils::uvec3&, + const SpecVarList&); + + inline DescriptorSet get_descriptor_set( + const ShaderInfo& shader_descriptor, + const utils::uvec3& 
local_work_group_size) { + return get_descriptor_set(shader_descriptor, local_work_group_size, {}); + } void register_shader_dispatch( const DescriptorSet&, @@ -196,6 +205,7 @@ class Context final { PipelineBarrier&, const utils::uvec3&, const utils::uvec3&, + const SpecVarList&, VkFence fence_handle, Arguments&&...); @@ -234,7 +244,7 @@ class UniformParamsBuffer final { } } - VulkanBuffer& buffer() { + const VulkanBuffer& buffer() const { return vulkan_buffer_; } @@ -254,6 +264,12 @@ class UniformParamsBuffer final { } }; +struct ParamsBindList final { + std::vector bind_infos; + + ParamsBindList(std::initializer_list init_list); +}; + class StorageBuffer final { private: Context* context_p_; @@ -321,6 +337,10 @@ inline void arg_is_empty(bool& any_is_empty, const VulkanImage& image) { any_is_empty = any_is_empty || !image; } +inline void arg_is_empty(bool& any_is_empty, const BufferBindInfo& bind_info) { + any_is_empty = any_is_empty || (bind_info.handle == VK_NULL_HANDLE); +} + /* Reports if any VulkanBuffer or VulkanImage argument in a variadic argument list does not have any memory associated with it. @@ -485,6 +505,7 @@ inline bool Context::submit_compute_job( PipelineBarrier& pipeline_barrier, const utils::uvec3& global_work_group, const utils::uvec3& local_work_group_size, + const SpecVarList& specialization_constants, VkFence fence_handle, Arguments&&... arguments) { // If any of the provided arguments does not have memory associated with it, @@ -527,8 +548,8 @@ inline bool Context::submit_compute_job( #endif /* USE_VULKAN_GPU_DIAGNOSTICS */ // Factor out template parameter independent code to minimize code bloat. - DescriptorSet descriptor_set = - get_descriptor_set(shader, local_work_group_size); + DescriptorSet descriptor_set = get_descriptor_set( + shader, local_work_group_size, specialization_constants); detail::bind( descriptor_set, diff --git a/backends/vulkan/runtime/api/Descriptor.cpp b/backends/vulkan/runtime/api/Descriptor.cpp index 25cbaeaa10d..572cc674981 100644 --- a/backends/vulkan/runtime/api/Descriptor.cpp +++ b/backends/vulkan/runtime/api/Descriptor.cpp @@ -15,6 +15,18 @@ namespace vkcompute { namespace api { +// +// BufferBinding +// + +BufferBindInfo::BufferBindInfo() + : handle(VK_NULL_HANDLE), offset(0u), range(0u) {} + +BufferBindInfo::BufferBindInfo(const VulkanBuffer& buffer_p) + : handle(buffer_p.handle()), + offset(buffer_p.mem_offset()), + range(buffer_p.mem_range()) {} + // // DescriptorSet // @@ -66,6 +78,21 @@ DescriptorSet& DescriptorSet::bind( return *this; } +DescriptorSet& DescriptorSet::bind( + const uint32_t idx, + const BufferBindInfo& bind_info) { + DescriptorSet::ResourceBinding binder{}; + binder.binding_idx = idx; // binding_idx + binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type + binder.is_image = false; // is_image + binder.resource_info.buffer_info.buffer = bind_info.handle; // buffer + binder.resource_info.buffer_info.offset = bind_info.offset; // offset + binder.resource_info.buffer_info.range = bind_info.range; // range + add_binding(binder); + + return *this; +} + DescriptorSet& DescriptorSet::bind( const uint32_t idx, const VulkanImage& image) { diff --git a/backends/vulkan/runtime/api/Descriptor.h b/backends/vulkan/runtime/api/Descriptor.h index 9b9dcda208e..0b6b1cd885a 100644 --- a/backends/vulkan/runtime/api/Descriptor.h +++ b/backends/vulkan/runtime/api/Descriptor.h @@ -20,6 +20,20 @@ namespace vkcompute { namespace api { +/* + * Stores the binding information of a Vulkan Buffer so that the buffer can 
be + * bound at a later time. This struct should only be used if the buffer to be + * bound is guaranteed to be active at the time of binding. + */ +struct BufferBindInfo final { + VkBuffer handle; + VkDeviceSize offset; + VkDeviceSize range; + + BufferBindInfo(); + BufferBindInfo(const VulkanBuffer& buffer_p); +}; + class DescriptorSet final { public: explicit DescriptorSet(VkDevice, VkDescriptorSet, ShaderLayout::Signature); @@ -50,6 +64,7 @@ class DescriptorSet final { std::vector bindings_; public: + DescriptorSet& bind(const uint32_t, const BufferBindInfo&); DescriptorSet& bind(const uint32_t, const VulkanBuffer&); DescriptorSet& bind(const uint32_t, const VulkanImage&); diff --git a/backends/vulkan/runtime/api/Pipeline.cpp b/backends/vulkan/runtime/api/Pipeline.cpp index 7207814707c..f4be0039e67 100644 --- a/backends/vulkan/runtime/api/Pipeline.cpp +++ b/backends/vulkan/runtime/api/Pipeline.cpp @@ -98,6 +98,101 @@ VkImageLayout vk_layout( return VK_IMAGE_LAYOUT_UNDEFINED; } +// +// SpecVar +// + +SpecVar::SpecVar() : type(SpecVar::Type::INT) { + value.as_int32 = 0; +} + +SpecVar::SpecVar(const float val) : type(SpecVar::Type::FLOAT) { + value.as_float = val; +} + +SpecVar::SpecVar(const int32_t val) : type(SpecVar::Type::INT) { + value.as_int32 = val; +} + +SpecVar::SpecVar(const uint32_t val) : type(SpecVar::Type::UINT) { + value.as_uint32 = val; +} + +SpecVar::SpecVar(const bool val) : type(SpecVar::Type::BOOL) { + value.as_bool = val; +} + +uint32_t SpecVar::val_size() const { + switch (type) { + case SpecVar::Type::FLOAT: + return sizeof(float); + case SpecVar::Type::INT: + return sizeof(int32_t); + case SpecVar::Type::UINT: + return sizeof(uint32_t); + case SpecVar::Type::BOOL: + return sizeof(bool); + } + return 4; +} + +uint32_t SpecVar::val_offset() const { + return api::utils::safe_downcast(offsetof(SpecVar, value)); +} + +bool operator==(const SpecVar& lhs, const SpecVar& rhs) { + if (lhs.type != rhs.type) { + return false; + } + switch (lhs.type) { + case SpecVar::Type::FLOAT: + return lhs.value.as_float == rhs.value.as_float; + case SpecVar::Type::INT: + return lhs.value.as_int32 == rhs.value.as_int32; + case SpecVar::Type::UINT: + return lhs.value.as_uint32 == rhs.value.as_uint32; + case SpecVar::Type::BOOL: + return lhs.value.as_bool == rhs.value.as_bool; + } + return false; +} + +SpecVarList::SpecVarList() {} + +SpecVarList::SpecVarList(std::initializer_list init_list) { + vars.resize(init_list.size()); + std::copy(init_list.begin(), init_list.end(), vars.begin()); +} + +void SpecVarList::append(const SpecVarList& other) { + vars.insert(vars.end(), other.vars.begin(), other.vars.end()); +} + +std::vector SpecVarList::generate_map_entries() + const { + std::vector map_entries; + map_entries.resize(vars.size()); + uint32_t cur_offset = 0u; + for (uint32_t i = 0; i < vars.size(); ++i) { + map_entries.at(i) = { + i, cur_offset + vars.at(i).val_offset(), vars.at(i).val_size()}; + cur_offset += sizeof(SpecVar); + } + return map_entries; +} + +bool operator==(const SpecVarList& lhs, const SpecVarList& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (uint32_t i = 0; i < lhs.size(); ++i) { + if (lhs.vars.at(i) != rhs.vars.at(i)) { + return false; + } + } + return true; +} + // // PipelineLayout // @@ -154,33 +249,14 @@ ComputePipeline::ComputePipeline( const ComputePipeline::Descriptor& descriptor, VkPipelineCache pipeline_cache) : device_(device), handle_{VK_NULL_HANDLE} { - // NOLINTNEXTLINE - constexpr VkSpecializationMapEntry 
specialization_map_entries[3]{ - // X - { - 0u, - offsetof(utils::uvec3, data[0u]), - sizeof(utils::uvec3::data[0u]), - }, - // Y - { - 1u, - offsetof(utils::uvec3, data[1u]), - sizeof(utils::uvec3::data[1u]), - }, - // Z - { - 2u, - offsetof(utils::uvec3, data[2u]), - sizeof(utils::uvec3::data[2u]), - }, - }; + std::vector map_entries = + descriptor.specialization_constants.generate_map_entries(); const VkSpecializationInfo specialization_info{ - 3u, // mapEntryCount - specialization_map_entries, // pMapEntries - sizeof(descriptor.local_work_group), // dataSize - &descriptor.local_work_group, // pData + descriptor.specialization_constants.size(), // mapEntryCount + map_entries.data(), // pMapEntries + descriptor.specialization_constants.data_nbytes(), // dataSize + descriptor.specialization_constants.data(), // pData }; const VkPipelineShaderStageCreateInfo shader_stage_create_info{ @@ -242,7 +318,7 @@ bool operator==( return ( _1.pipeline_layout == _2.pipeline_layout && _1.shader_module == _2.shader_module && - _1.local_work_group == _2.local_work_group); + _1.specialization_constants == _2.specialization_constants); } // diff --git a/backends/vulkan/runtime/api/Pipeline.h b/backends/vulkan/runtime/api/Pipeline.h index 409fd2afa87..b8c16efd910 100644 --- a/backends/vulkan/runtime/api/Pipeline.h +++ b/backends/vulkan/runtime/api/Pipeline.h @@ -18,9 +18,73 @@ #include #include +#define SV(x) ::vkcompute::api::SpecVar(x) + namespace vkcompute { namespace api { +struct SpecVar final { + enum class Type : uint8_t { + FLOAT, + INT, + UINT, + BOOL, + }; + + union Value { + int32_t as_int32; + uint32_t as_uint32; + float as_float; + bool as_bool; + }; + + Value value; + Type type; + + SpecVar(); + SpecVar(const float val); + SpecVar(const int32_t val); + SpecVar(const uint32_t val); + SpecVar(const bool val); + + uint32_t val_size() const; + uint32_t val_offset() const; +}; + +bool operator==(const SpecVar& lhs, const SpecVar& rhs); + +class SpecVarList final { + std::vector vars; + + public: + SpecVarList(); + SpecVarList(std::initializer_list init_list); + + inline const SpecVar& at(const size_t index) const { + return vars.at(index); + } + + inline const SpecVar* data() const { + return vars.data(); + } + + inline uint32_t size() const { + return api::utils::safe_downcast(vars.size()); + } + + inline uint32_t data_nbytes() const { + return vars.size() * sizeof(SpecVar); + } + + void append(const SpecVarList& other); + + std::vector generate_map_entries() const; + + friend bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); +}; + +bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); + struct PipelineBarrier final { struct Stages final { VkPipelineStageFlags src; @@ -83,7 +147,7 @@ class ComputePipeline final { struct Descriptor final { VkPipelineLayout pipeline_layout; VkShaderModule shader_module; - utils::uvec3 local_work_group; + SpecVarList specialization_constants; }; explicit ComputePipeline( @@ -171,12 +235,29 @@ class ComputePipelineCache final { seed, std::hash()(descriptor.pipeline_layout)); seed = utils::hash_combine( seed, std::hash()(descriptor.shader_module)); - seed = utils::hash_combine( - seed, std::hash()(descriptor.local_work_group.data[0u])); - seed = utils::hash_combine( - seed, std::hash()(descriptor.local_work_group.data[1u])); - seed = utils::hash_combine( - seed, std::hash()(descriptor.local_work_group.data[2u])); + + const SpecVarList& spec_vars = descriptor.specialization_constants; + seed = utils::hash_combine(seed, 
std::hash()(spec_vars.size())); + + for (int i = 0; i < spec_vars.size(); ++i) { + const SpecVar& spec_var = spec_vars.at(i); + size_t new_seed = 0; + switch (spec_var.type) { + case SpecVar::Type::FLOAT: + new_seed = std::hash()(spec_var.value.as_float); + break; + case SpecVar::Type::INT: + new_seed = std::hash()(spec_var.value.as_int32); + break; + case SpecVar::Type::UINT: + new_seed = std::hash()(spec_var.value.as_uint32); + break; + case SpecVar::Type::BOOL: + new_seed = std::hash()(spec_var.value.as_bool); + break; + } + seed = utils::hash_combine(seed, new_seed); + } return seed; } diff --git a/backends/vulkan/runtime/api/Runtime.cpp b/backends/vulkan/runtime/api/Runtime.cpp index b470435a894..e113a4e3b4f 100644 --- a/backends/vulkan/runtime/api/Runtime.cpp +++ b/backends/vulkan/runtime/api/Runtime.cpp @@ -16,6 +16,22 @@ namespace vkcompute { namespace api { +#define PRINT_CASE(name) \ + case MemoryAccessType::name: \ + out << #name; \ + break; + +std::ostream& operator<<(std::ostream& out, const MemoryAccessType& tag) { + switch (tag) { + PRINT_CASE(NONE) + PRINT_CASE(READ) + PRINT_CASE(WRITE) + } + return out; +} + +#undef PRINT_CASE + namespace { void find_requested_layers_and_extensions( @@ -69,7 +85,7 @@ VkInstance create_instance(const RuntimeConfiguration& config) { 0, // applicationVersion nullptr, // pEngineName 0, // engineVersion - VK_API_VERSION_1_0, // apiVersion + VK_API_VERSION_1_1, // apiVersion }; std::vector enabled_layers; diff --git a/backends/vulkan/runtime/api/Shader.cpp b/backends/vulkan/runtime/api/Shader.cpp index 3f0f90119dd..731a79b1f7f 100644 --- a/backends/vulkan/runtime/api/Shader.cpp +++ b/backends/vulkan/runtime/api/Shader.cpp @@ -23,38 +23,19 @@ ShaderInfo::ShaderInfo() 0u, } {} -ShaderInfo::ShaderInfo( - std::string name, - const uint32_t* const spirv_bin, - const uint32_t size, - std::vector layout) - : src_code{ - spirv_bin, - size, - }, - kernel_name{std::move(name)}, - kernel_layout{std::move(layout)} {} - ShaderInfo::ShaderInfo( std::string name, const uint32_t* const spirv_bin, const uint32_t size, std::vector layout, - const std::vector& tile_size, - const StorageType bias_storage_type, - const StorageType weight_storage_type) + const utils::uvec3 tile_size) : src_code{ spirv_bin, size, }, kernel_name{std::move(name)}, kernel_layout{std::move(layout)}, - tile_size(tile_size), - bias_storage_type(bias_storage_type), - weight_storage_type(weight_storage_type) { - for (uint64_t i = 0; i < tile_size.size(); ++i) { - out_tile_size.data[i] = tile_size[i]; - } + out_tile_size(tile_size) { } bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) { diff --git a/backends/vulkan/runtime/api/Shader.h b/backends/vulkan/runtime/api/Shader.h index 6677ecee30b..9a04b52b80a 100644 --- a/backends/vulkan/runtime/api/Shader.h +++ b/backends/vulkan/runtime/api/Shader.h @@ -62,25 +62,14 @@ struct ShaderInfo final { // Shader Metadata utils::uvec3 out_tile_size{1u, 1u, 1u}; - std::vector tile_size; - StorageType bias_storage_type{StorageType::UNKNOWN}; - StorageType weight_storage_type{StorageType::UNKNOWN}; - explicit ShaderInfo(); - explicit ShaderInfo(std::string, const char*); - explicit ShaderInfo( - std::string, - const uint32_t*, - const uint32_t, - std::vector); + explicit ShaderInfo( std::string, const uint32_t*, const uint32_t, std::vector, - const std::vector& tile_size, - const StorageType bias_storage_type, - const StorageType weight_storage_type); + const utils::uvec3 tile_size); }; bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); 
diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp index 89424abb2a0..402d35d75bb 100644 --- a/backends/vulkan/runtime/api/Tensor.cpp +++ b/backends/vulkan/runtime/api/Tensor.cpp @@ -13,80 +13,6 @@ namespace vkcompute { namespace { -/* - * Calculates the strides of a contiguous tensor. empty_tensor_restride from - * TensorImpl.h was used as a reference. - */ -std::vector calc_contiguous_strides( - const std::vector& sizes) { - int64_t ndim = static_cast(sizes.size()); - std::vector strides(ndim); - - int64_t running_product = 1; - if (ndim >= 1) { - strides.at(ndim - 1) = running_product; - for (int i = static_cast(sizes.size()) - 2; i >= 0; --i) { - running_product *= sizes.at(i + 1); - strides.at(i) = running_product; - } - } - - return strides; -} - -std::vector calc_channels_last_strides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - - switch (sizes.size()) { - case 4: - strides.at(1) = 1; - strides.at(3) = sizes.at(1); - strides.at(2) = strides.at(3) * sizes.at(3); - strides.at(0) = strides.at(2) * sizes.at(2); - return strides; - case 3: - strides.at(0) = 1; - strides.at(2) = sizes.at(0); - strides.at(1) = strides.at(2) * sizes.at(2); - return strides; - default: - VK_THROW("ChannelsLast format only available for 3 <= ndim <= 4!"); - } - - return strides; -} - -/* - * Calculates the strides of a tensor based on the sizes and memory format. Note - * that strides are only valid for vTensors that are backed by buffer storage; - * if texture storage is used then the strides are invalid and set to zeros. - */ -std::vector calc_strides( - const std::vector& sizes, - const api::GPUMemoryLayout memory_layout, - const api::StorageType storage_type) { - switch (storage_type) { - case api::StorageType::BUFFER: - switch (memory_layout) { - case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: - return calc_contiguous_strides(sizes); - break; - case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: - return calc_channels_last_strides(sizes); - break; - default: - VK_THROW("Invalid memory format used to create vTensor!"); - } - break; - case api::StorageType::TEXTURE_3D: - case api::StorageType::TEXTURE_2D: - return std::vector(sizes.size()); - default: - VK_THROW("Invalid storage type used to create vTensor!"); - } -} - /* * When stored on the GPU, one dimension will be aligned to the next multiple of * 4 in order to take advantage of vec4 data types. 
The dimension that is @@ -99,10 +25,8 @@ std::vector calc_gpu_sizes( const std::vector& sizes, const api::GPUMemoryLayout memory_layout, const api::StorageType storage_type) { - VK_CHECK_COND(storage_type != api::StorageType::UNKNOWN); - std::vector gpu_sizes; - if (storage_type == api::StorageType::BUFFER) { + if (storage_type == api::kBuffer) { gpu_sizes.resize(sizes.size()); for (size_t i = 0; i < sizes.size(); i++) { gpu_sizes.at(i) = sizes.at(i); @@ -127,21 +51,21 @@ std::vector calc_gpu_sizes( size_t ndim = gpu_sizes.size(); switch (memory_layout) { - case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: + case api::kWidthPacked: if (ndim >= 1) { gpu_sizes.at(ndim - 1) = api::utils::align_up(api::utils::val_at(-1, sizes), INT64_C(4)); } break; - case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED: + case api::kHeightPacked: if (ndim >= 2) { gpu_sizes.at(ndim - 2) = api::utils::align_up(api::utils::val_at(-2, sizes), INT64_C(4)); } break; - case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: + case api::kChannelsPacked: if (ndim >= 3) { gpu_sizes.at(ndim - 3) = api::utils::align_up(api::utils::val_at(-3, sizes), INT64_C(4)); @@ -162,7 +86,7 @@ api::utils::uvec3 create_image_extents( const api::GPUMemoryLayout memory_layout) { size_t ndim = gpu_sizes.size(); - if (storage_type == api::StorageType::BUFFER) { + if (storage_type == api::kBuffer) { // image extents do not apply to buffer storage return {0u, 0u, 0u}; } else { @@ -177,15 +101,15 @@ api::utils::uvec3 create_image_extents( uint32_t batch = safe_downcast(val_at(-4, gpu_sizes)); switch (memory_layout) { - case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: - VK_CHECK_COND(width % 4 == 0, "Channels must be divisible by 4!"); + case api::kWidthPacked: + VK_CHECK_COND(width % 4 == 0, "Width must be divisible by 4!"); width /= 4; break; - case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED: - VK_CHECK_COND(height % 4 == 0, "Channels must be divisible by 4!"); + case api::kHeightPacked: + VK_CHECK_COND(height % 4 == 0, "Height must be divisible by 4!"); height /= 4; break; - case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: + case api::kChannelsPacked: VK_CHECK_COND(channels % 4 == 0, "Channels must be divisible by 4!"); channels /= 4; break; @@ -214,152 +138,110 @@ vTensor::vTensor( memory_layout_(memory_layout), // Calculate sizes and strides sizes_(sizes.begin(), sizes.end()), - strides_{calc_strides(sizes, memory_layout_, storage_type)}, gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)}, - gpu_strides_{calc_strides(gpu_sizes_, memory_layout_, storage_type)}, - virtual_extents_( - create_image_extents(gpu_sizes_, storage_type, memory_layout)), + texture_limits_{{0, 0, 0}}, // Utility Uniform Buffers that can be passed to shaders as arguments - cpu_sizes_uniform_(nullptr), - gpu_sizes_uniform_(nullptr), - extents_uniform_(nullptr), + sizes_uniform_(), + texture_limits_uniform_(), // Construct Tensor storage - view_(std::make_shared( + storage_( context, storage_type, memory_layout_, gpu_sizes_, dtype_, - allocate_memory)) {} + allocate_memory) { + if (storage_type != api::kBuffer) { + texture_limits_.limits = api::utils::ivec3{ + api::utils::safe_downcast(storage_.extents_.data[0]), + api::utils::safe_downcast(storage_.extents_.data[1]), + api::utils::safe_downcast(storage_.extents_.data[2])}; + } -vTensor::vTensor( - api::Context* const context, - const std::vector& sizes, - double q_scale, - int64_t q_zero_point, - const api::ScalarType dtype, - const api::StorageType storage_type, - const api::GPUMemoryLayout memory_layout) - : 
dtype_(dtype), - memory_layout_(memory_layout), - // Calculate sizes and strides - sizes_(sizes.begin(), sizes.end()), - strides_{calc_strides(sizes, memory_layout_, storage_type)}, - gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)}, - gpu_strides_{calc_strides(gpu_sizes_, memory_layout_, storage_type)}, - virtual_extents_( - create_image_extents(gpu_sizes_, storage_type, memory_layout)), - // Vulkan uniform buffer containing sizes and stride info - cpu_sizes_uniform_(nullptr), - gpu_sizes_uniform_(nullptr), - extents_uniform_(nullptr), - // Quantization params - is_quantized_{true}, - q_scale_{q_scale}, - q_zero_point_{q_zero_point}, - // Construct Tensor storage - view_(std::make_shared( - context, - storage_type, - memory_layout_, - gpu_sizes_, - dtype_)) {} + if (dtype == api::kHalf) { + VK_CHECK_COND( + api::context()->adapter_ptr()->has_16bit_storage(), + "Half dtype is only available if the physical device supports float16 " + "storage buffers!"); + } +} api::VulkanImage& vTensor::image( api::PipelineBarrier& pipeline_barrier, - const api::PipelineStageFlags stage) const& { - view_->transition(pipeline_barrier, stage, api::MemoryAccessType::READ); - return view_->image_; + const api::PipelineStageFlags stage) & { + storage_.transition(pipeline_barrier, stage, api::MemoryAccessType::READ); + return storage_.image_; } api::VulkanImage& vTensor::image( api::PipelineBarrier& pipeline_barrier, const api::PipelineStageFlags stage, const api::MemoryAccessFlags access) & { - view_->transition(pipeline_barrier, stage, access); - return view_->image_; + storage_.transition(pipeline_barrier, stage, access); + return storage_.image_; } api::VulkanBuffer& vTensor::buffer( api::PipelineBarrier& pipeline_barrier, - const api::PipelineStageFlags stage) const& { - view_->transition(pipeline_barrier, stage, api::MemoryAccessType::READ); - return view_->buffer_; + const api::PipelineStageFlags stage) & { + storage_.transition(pipeline_barrier, stage, api::MemoryAccessType::READ); + return storage_.buffer_; } api::VulkanBuffer& vTensor::buffer( api::PipelineBarrier& pipeline_barrier, const api::PipelineStageFlags stage, const api::MemoryAccessFlags access) & { - view_->transition(pipeline_barrier, stage, access); - return view_->buffer_; -} - -std::shared_ptr vTensor::cpu_sizes_ubo() { - if (!cpu_sizes_uniform_) { - cpu_sizes_uniform_.reset(new api::UniformParamsBuffer( - view_->context_, api::utils::make_whcn_ivec4(sizes_))); - } - return cpu_sizes_uniform_; + storage_.transition(pipeline_barrier, stage, access); + return storage_.buffer_; } -std::shared_ptr vTensor::gpu_sizes_ubo() { - if (!gpu_sizes_uniform_) { - gpu_sizes_uniform_.reset(new api::UniformParamsBuffer( - view_->context_, api::utils::make_whcn_ivec4(gpu_sizes_))); +const api::BufferBindInfo vTensor::sizes_ubo() { + if (!sizes_uniform_.buffer()) { + sizes_uniform_ = api::UniformParamsBuffer( + storage_.context_, api::utils::make_whcn_ivec4(sizes_)); } - return gpu_sizes_uniform_; + return api::BufferBindInfo(sizes_uniform_.buffer()); } -std::shared_ptr vTensor::extents_ubo() { - if (!extents_uniform_) { - extents_uniform_.reset(new api::UniformParamsBuffer( - view_->context_, - api::utils::uvec4( - {view_->extents_.data[0], - view_->extents_.data[1], - view_->extents_.data[2], - 1u}))); +const api::BufferBindInfo vTensor::texture_limits_ubo() { + if (!texture_limits_uniform_.buffer()) { + texture_limits_uniform_ = + api::UniformParamsBuffer(storage_.context_, texture_limits_); } - return extents_uniform_; + return 
api::BufferBindInfo(texture_limits_uniform_.buffer()); } VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { switch (storage_type()) { - case api::StorageType::BUFFER: - return view_->buffer_.allocation_create_info(); - case api::StorageType::TEXTURE_2D: - case api::StorageType::TEXTURE_3D: - return view_->image_.allocation_create_info(); - case api::StorageType::UNKNOWN: - break; + case api::kBuffer: + return storage_.buffer_.allocation_create_info(); + case api::kTexture2D: + case api::kTexture3D: + return storage_.image_.allocation_create_info(); } return {}; } VkMemoryRequirements vTensor::get_memory_requirements() const { switch (storage_type()) { - case api::StorageType::BUFFER: - return view_->buffer_.get_memory_requirements(); - case api::StorageType::TEXTURE_2D: - case api::StorageType::TEXTURE_3D: - return view_->image_.get_memory_requirements(); - case api::StorageType::UNKNOWN: - break; + case api::kBuffer: + return storage_.buffer_.get_memory_requirements(); + case api::kTexture2D: + case api::kTexture3D: + return storage_.image_.get_memory_requirements(); } return {}; } void vTensor::bind_allocation(const api::MemoryAllocation& allocation) { switch (storage_type()) { - case api::StorageType::BUFFER: - view_->buffer_.bind_allocation(allocation); - break; - case api::StorageType::TEXTURE_2D: - case api::StorageType::TEXTURE_3D: - view_->image_.bind_allocation(allocation); + case api::kBuffer: + storage_.buffer_.bind_allocation(allocation); break; - case api::StorageType::UNKNOWN: + case api::kTexture2D: + case api::kTexture3D: + storage_.image_.bind_allocation(allocation); break; } } @@ -367,60 +249,52 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) { void vTensor::update_size_metadata(const std::vector& new_sizes) { sizes_ = new_sizes; gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type()); - virtual_extents_ = - create_image_extents(gpu_sizes_, storage_type(), memory_layout_); - if (cpu_sizes_uniform_) { - cpu_sizes_uniform_->update(api::utils::make_whcn_ivec4(sizes_)); + if (storage_type() != api::kBuffer) { + // Calculate the extents of the image texture that would have been required + // for a tensor of the new sizes. + api::utils::uvec3 virtual_extents = + create_image_extents(gpu_sizes_, storage_type(), memory_layout_); + // Update the texture limits to reflect the new virtual extents. 
+ texture_limits_.limits = api::utils::ivec3{ + api::utils::safe_downcast(virtual_extents.data[0]), + api::utils::safe_downcast(virtual_extents.data[1]), + api::utils::safe_downcast(virtual_extents.data[2])}; } - if (gpu_sizes_uniform_) { - gpu_sizes_uniform_->update(api::utils::make_whcn_ivec4(gpu_sizes_)); + if (sizes_uniform_.buffer()) { + sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_)); } - - if (extents_uniform_) { - extents_uniform_->update(api::utils::uvec4( - {virtual_extents_.data[0], - virtual_extents_.data[1], - virtual_extents_.data[2], - 1u})); + if (texture_limits_uniform_.buffer()) { + texture_limits_uniform_.update(texture_limits_); } } void vTensor::reallocate(const std::vector& new_sizes) { update_size_metadata(new_sizes); - view_->discard_and_reallocate( + storage_.discard_and_reallocate( calc_gpu_sizes(new_sizes, memory_layout_, storage_type()), memory_layout_, dtype_); } void vTensor::virtual_resize(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - if (storage_type() == api::StorageType::BUFFER) { - if (gpu_nbytes() > view_->buffer_.mem_size()) { - VK_THROW( - "Cannot virtual_resize a vTensor with sizes that require a larger " - "buffer! reallocate() should be used instead."); - } - } else { - bool valid_resize = true; - if (virtual_extents_.data[0] > view_->extents_.data[0]) { - valid_resize = false; - } - if (virtual_extents_.data[1] > view_->extents_.data[1]) { - valid_resize = false; - } - if (virtual_extents_.data[2] > view_->extents_.data[2]) { - valid_resize = false; - } + // For texture storage check that the current texture is large enough for the + // new sizes of the tensor. + if (storage_type() != api::kBuffer) { + api::utils::uvec3 virtual_extents = + create_image_extents(gpu_sizes_, storage_type(), memory_layout_); - if (!valid_resize) { - VK_THROW( - "Cannot virtual_resize a vTensor with sizes that require a larger " - "image texture! 
reallocate() should be used instead."); - } + bool valid_resize = virtual_extents.data[0] <= extents().data[0]; + valid_resize = valid_resize && virtual_extents.data[1] <= extents().data[1]; + valid_resize = valid_resize && virtual_extents.data[2] <= extents().data[2]; + + VK_CHECK_COND( + valid_resize, + "Cannot use virtual resize if new sizes requires a larger texture."); } + + update_size_metadata(new_sizes); } // @@ -443,14 +317,14 @@ api::VulkanImage allocate_image( }; VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D; + VkImageViewType image_view_type; switch (storage_type) { - case api::StorageType::TEXTURE_3D: + case api::kTexture3D: image_type = VK_IMAGE_TYPE_3D; image_view_type = VK_IMAGE_VIEW_TYPE_3D; break; - case api::StorageType::TEXTURE_2D: + case api::kTexture2D: image_type = VK_IMAGE_TYPE_2D; image_view_type = VK_IMAGE_VIEW_TYPE_2D; break; @@ -481,7 +355,7 @@ api::VulkanBuffer allocate_buffer( api::Adapter* adapter_ptr = context_ptr->adapter_ptr(); switch (storage_type) { - case api::StorageType::BUFFER: + case api::kBuffer: break; default: // Return an empty VulkanBuffer if Buffer storage is not used @@ -585,39 +459,6 @@ void vTensorStorage::transition( last_access_.access = cur_access; } -void add_buffer_barrier( - api::PipelineBarrier& pipeline_barrier, - const api::VulkanBuffer& buffer, - const api::PipelineStageFlags prev_stage, - const api::MemoryAccessFlags prev_access, - const api::PipelineStageFlags cur_stage, - const api::MemoryAccessFlags cur_access) { - // Check for RAW - const bool read_requested = (cur_access & api::MemoryAccessType::READ) != 0; - const bool prev_written = (prev_access & api::MemoryAccessType::WRITE) != 0; - - const bool is_RAW = read_requested && prev_written; - - if (is_RAW) { - VkPipelineStageFlags src_stage = api::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = api::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - pipeline_barrier.buffers.emplace_back( - api::vk_access(prev_stage, prev_access), - api::vk_access(cur_stage, cur_access), - buffer); - } -} - void vTensorStorage::discard_and_reallocate( const std::vector& gpu_sizes, const api::GPUMemoryLayout gpu_memory_layout, diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h index 3ce34b9949a..787e8111204 100644 --- a/backends/vulkan/runtime/api/Tensor.h +++ b/backends/vulkan/runtime/api/Tensor.h @@ -42,11 +42,11 @@ class vTensorStorage final { const api::ScalarType dtype, const bool allocate_memory = true); - vTensorStorage(const vTensorStorage&) = delete; - vTensorStorage& operator=(const vTensorStorage&) = delete; + vTensorStorage(const vTensorStorage& other) = delete; + vTensorStorage& operator=(const vTensorStorage& other) = delete; - vTensorStorage(vTensorStorage&&) = default; - vTensorStorage operator=(vTensorStorage&&) = delete; + vTensorStorage(vTensorStorage&& other) = default; + vTensorStorage& operator=(vTensorStorage&& other) = default; ~vTensorStorage(); @@ -94,115 +94,60 @@ class vTensorStorage final { }; class vTensor final { - public: - // Do not allow empty vTensor construction - vTensor() = default; + struct TextureLimits { + // Alignment is required to conform with Vulkan specification; a 3 or 4 + // component vector with components of size N must have 
base alignment of + // 4N. + alignas(16) api::utils::ivec3 limits; + }; - // Default constructor - vTensor( + public: + explicit vTensor( api::Context* context, const std::vector& sizes, const api::ScalarType dtype, - const api::StorageType storage_type = api::StorageType::TEXTURE_3D, - const api::GPUMemoryLayout memory_layout = - api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, + const api::StorageType storage_type = api::kTexture3D, + const api::GPUMemoryLayout memory_layout = api::kChannelsPacked, const bool allocate_memory = true); - // Default constructor for quantized vTensor - vTensor( - api::Context* const context, - const std::vector& sizes, - double q_scale, - int64_t q_zero_point, - const api::ScalarType dtype, - const api::StorageType storage_type = api::StorageType::TEXTURE_3D, - const api::GPUMemoryLayout memory_layout = - api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - // Copy Constructor and Assignment; Ideally copying would be disabled - // (see the reasoning for move assignment below) but it is required for - // compatibility with OpaqueTensorImpl - vTensor(const vTensor& other) = default; - vTensor& operator=(const vTensor& other) = default; + vTensor(const vTensor& other) = delete; + vTensor& operator=(const vTensor& other) = delete; - // Move Constructor and assignment vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; private: - // Tensor Options api::ScalarType dtype_; - - // GPU specific memory layout qualifier api::GPUMemoryLayout memory_layout_; - // Sizes and Strides std::vector sizes_; - std::vector strides_; - - // Storage Dimensions. When stored on the GPU, one dimension will be aligned - // to the next multiple of 4 in order to take advantage of vec4 data types. std::vector gpu_sizes_; - std::vector gpu_strides_; - - // The extents that correspond to the tensor's size metadata. Note that this - // may not be the same as the extents of the underlying image texture because - // vTensor can be virtually resized via virtual_resize() which will cause it - // to be interpreted as a tensor with a different size. - api::utils::uvec3 virtual_extents_; - - // A Vulkan uniform buffer containing the tensor sizes that can be passed into - // a shader. - std::shared_ptr cpu_sizes_uniform_; - - // A Vulkan uniform buffer containing the GPU tensor sizes that can be passed - // into a shader. GPU sizes refers to the sizes of the tensor after padding - // has been applied to one dimension to align it to the next multiple of 4. - std::shared_ptr gpu_sizes_uniform_; - - // A Vulkan uniform buffer containing the image extents of the underlying - // image texture that can be passed into a shader. - std::shared_ptr extents_uniform_; - - // Quantization params - bool is_quantized_{false}; - double q_scale_{1.0f}; - int64_t q_zero_point_{0u}; - - // Even at the cost of a heap allocation plus the resulting negative impact - // on cache locality due to the subsequent pointer chasing, it is still - // critical to share the view across vTensor implementations to minimize - // programmer errors. Ideally this class should have been only made movable, - // and non-copyable - something we cannot do unfortunately due to the inner - // workings of at::TensorImpl requiring copy semantics in - // at::TensorImpl::release_resources() to function as expected. 
Now that this - // class is made copyable though, a new door to a whole new class of bugs is - // opened, in that there now is a chance of two [shallow] copies, have their - // StorageState objects go out of sync as a result of an operation being - // performed on one shallow copy that is not reflected in the other. - // Technically, if the programmer is very careful, it is possible to avoid - // this trap and not pay the cost of indirection, but the resulting bugs of - // missing memory barriers will be so frustrating to hunt down for those - // unfamiliar with the internal mechanics of this class, that I decided to - // take the performance penalty of this extra layer of indirection in favor - // of making this class easier to use. - std::shared_ptr view_; + TextureLimits texture_limits_; + + // A Vulkan uniform buffer containing the (W, H, C, N) tensor sizes that can + // be passed into a shader. + api::UniformParamsBuffer sizes_uniform_; + + // A Vulkan uniform buffer containing the texture limits derived from the + // tensor's current size information that can be passed into a shader. Note + // that the texture limits may be different from the texture's extents if the + // tensor has been resized with `virtual_resize()`. + api::UniformParamsBuffer texture_limits_uniform_; + + vTensorStorage storage_; public: /* Texture Access */ - inline api::StorageType storage_type() const { - return view_->storage_type_; - } - inline api::VulkanImage& image() const& { - return view_->image_; + return storage_.image_; } - api::VulkanImage& image(api::PipelineBarrier&, const api::PipelineStageFlags) - const&; + api::VulkanImage& image( + api::PipelineBarrier&, + const api::PipelineStageFlags) &; api::VulkanImage& image( api::PipelineBarrier&, @@ -210,12 +155,12 @@ class vTensor final { const api::MemoryAccessFlags) &; inline api::VulkanBuffer& buffer() const& { - return view_->buffer_; + return storage_.buffer_; } api::VulkanBuffer& buffer( api::PipelineBarrier&, - const api::PipelineStageFlags) const&; + const api::PipelineStageFlags) &; api::VulkanBuffer& buffer( api::PipelineBarrier&, @@ -226,8 +171,12 @@ class vTensor final { Metadata */ + inline api::StorageType storage_type() const { + return storage_.storage_type_; + } + inline const api::utils::uvec3& extents() const { - return view_->extents_; + return storage_.extents_; } /* @@ -237,93 +186,42 @@ class vTensor final { return dtype_; } - /* - * Get an `api::ScalarType` that corresponds to the image format of the - * texture - */ - inline api::ScalarType texture_dtype() const { - return api::element_scalartype(view_->texture_format()); - } - inline api::GPUMemoryLayout gpu_memory_layout() const { return memory_layout_; } - inline uint32_t gpu_memory_layout_as_uint() const { - return static_cast(memory_layout_); + inline int32_t gpu_memory_layout_int() const { + return static_cast(memory_layout_); } inline const std::vector& sizes() const { return sizes_; } - inline const std::vector& strides() const { - return strides_; + inline const int64_t size(size_t dim) const { + return sizes().at(dim); } - inline const std::vector& gpu_sizes() const { - return gpu_sizes_; - } - - inline const std::vector& gpu_strides() const { - return gpu_strides_; - } - - inline const api::utils::uvec3& virtual_extents() const { - return virtual_extents_; + inline const int64_t dim() const { + return sizes_.size(); } /* - * Get a uniform buffer object containing the tensor sizes to use in a compute - * shader. 
Note that the UBO will be created the first time this function is - * called. + * Get the binding information for the uniform buffer object containing the + * tensor sizes to use in a compute shader. Note that the GPU buffer will be + * allocated the first time this function is called. */ - std::shared_ptr cpu_sizes_ubo(); + const api::BufferBindInfo sizes_ubo(); /* - * Get a uniform buffer object containing the tensor GPU sizes to use in a - * compute shader. Note that the UBO will be created the first time this - * function is called. + * Get the binding information for the uniform buffer object containing the + * texture limits to use in a compute shader. Note that the GPU buffer will be + * allocated the first time this function is called. */ - std::shared_ptr gpu_sizes_ubo(); - - /* - * Get a uniform buffer object containing the image extents to use in a - * compute shader. Note that the UBO will be created the first time this - * function is called. - */ - std::shared_ptr extents_ubo(); - - inline void set_is_quantized() { - is_quantized_ = true; - } - - inline bool is_quantized() const { - return is_quantized_; - } + const api::BufferBindInfo texture_limits_ubo(); - inline void set_scale(const double q_scale) { - q_scale_ = q_scale; - } - - inline double get_scale() const { - return q_scale_; - } - - inline float get_scale_float() const { - return api::utils::safe_downcast(q_scale_); - } - - inline void set_zero_point(const int64_t q_zero_point) { - q_zero_point_ = q_zero_point; - } - - inline int64_t get_zero_point() const { - return q_zero_point_; - } - - inline int32_t get_zero_point_int32() const { - return api::utils::safe_downcast(q_zero_point_); + inline const api::utils::ivec3 texture_limits() const { + return texture_limits_.limits; } inline size_t numel() const { @@ -342,7 +240,7 @@ class vTensor final { } /* - * Return nbytes but bnased on gpu_sizes_ instead of sizes_ + * Return nbytes but based on gpu_sizes_ instead of sizes_ */ inline VkDeviceSize gpu_nbytes() const { return api::element_size(dtype()) * gpu_numel(); @@ -385,12 +283,4 @@ class vTensor final { void virtual_resize(const std::vector& new_sizes); }; -void add_buffer_barrier( - api::PipelineBarrier&, - const api::VulkanBuffer&, - const api::PipelineStageFlags, - const api::MemoryAccessFlags, - const api::PipelineStageFlags, - const api::MemoryAccessFlags); - } // namespace vkcompute diff --git a/backends/vulkan/runtime/api/Types.h b/backends/vulkan/runtime/api/Types.h index 03cef80e673..c63f164aa8f 100644 --- a/backends/vulkan/runtime/api/Types.h +++ b/backends/vulkan/runtime/api/Types.h @@ -23,15 +23,15 @@ #define VK_FORMAT_FLOAT4 VK_FORMAT_R32G32B32A32_SFLOAT #endif /* USE_VULKAN_FP16_INFERENCE */ -#define VK_FORALL_SCALAR_TYPES(_) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Byte) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, Char) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int) \ - _(bool, VK_FORMAT_R8G8B8A8_SINT, Bool) \ - _(float, VK_FORMAT_R16G16B16A16_SFLOAT, Half) \ - _(float, VK_FORMAT_FLOAT4, Float) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ +#define VK_FORALL_SCALAR_TYPES(_) \ + _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Byte) \ + _(int8_t, VK_FORMAT_R8G8B8A8_SINT, Char) \ + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int) \ + _(bool, VK_FORMAT_R8G8B8A8_SINT, Bool) \ + _(uint16_t, VK_FORMAT_R16G16B16A16_SFLOAT, Half) \ + _(float, VK_FORMAT_FLOAT4, Float) \ + _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ + _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ _(int32_t, 
VK_FORMAT_R32G32B32A32_SINT, QInt32) namespace vkcompute { @@ -162,13 +162,16 @@ VK_FORALL_SCALAR_TYPES(SPECIALIZE_ScalarTypeToCType) * * UNKNOWN is not expected to be used. */ -enum class StorageType { +enum class StorageType : uint8_t { BUFFER, TEXTURE_3D, TEXTURE_2D, - UNKNOWN, }; +static constexpr StorageType kBuffer = StorageType::BUFFER; +static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; +static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; + /** * The enum below is used to describe how tensor data is laid out when stored in * GPU memory. The name of the enum describes which dimension is tightly packed; @@ -182,11 +185,20 @@ enum class StorageType { * strides of the tensor will be used instead to convert between logical tensor * coordinates and linear access indices. */ -enum class GPUMemoryLayout : uint32_t { +enum class GPUMemoryLayout : uint8_t { TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, }; +static constexpr GPUMemoryLayout kWidthPacked = + GPUMemoryLayout::TENSOR_WIDTH_PACKED; + +static constexpr GPUMemoryLayout kHeightPacked = + GPUMemoryLayout::TENSOR_HEIGHT_PACKED; + +static constexpr GPUMemoryLayout kChannelsPacked = + GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + } // namespace api } // namespace vkcompute diff --git a/backends/vulkan/runtime/api/Utils.h b/backends/vulkan/runtime/api/Utils.h index f04c11ba030..d12844bbf1e 100644 --- a/backends/vulkan/runtime/api/Utils.h +++ b/backends/vulkan/runtime/api/Utils.h @@ -262,6 +262,23 @@ inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { return os; } +inline std::ostream& operator<<(std::ostream& os, const ivec3& v) { + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const uvec4& v) { + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ")"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const ivec4& v) { + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ")"; + return os; +} + // // std::vector Handling // @@ -292,6 +309,25 @@ inline ivec2 make_ivec2( } } +inline ivec3 make_ivec3( + const std::vector& ints, + bool reverse = false) { + VK_CHECK_COND(ints.size() == 3); + if (reverse) { + return { + safe_downcast(ints[2]), + safe_downcast(ints[1]), + safe_downcast(ints[0]), + }; + } else { + return { + safe_downcast(ints[0]), + safe_downcast(ints[1]), + safe_downcast(ints[2]), + }; + } +} + inline ivec4 make_ivec4( const std::vector& ints, bool reverse = false) { @@ -332,6 +368,13 @@ inline ivec3 make_ivec3(uvec3 ints) { safe_downcast(ints.data[2u])}; } +inline uvec3 make_uvec3(ivec3 ints) { + return { + safe_downcast(ints.data[0u]), + safe_downcast(ints.data[1u]), + safe_downcast(ints.data[2u])}; +} + /* * Given an vector of up to 4 uint64_t representing the sizes of a tensor, * constructs a uvec4 containing those elements in reverse order. 
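The stream operators and `make_*` helpers added above are small, but a quick sketch may help show how they compose; the values below are arbitrary, and the includes and namespace usage are assumptions made only to keep the example self-contained.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Assumes the Utils.h declarations from this diff are visible.
using namespace vkcompute::api::utils;

void print_extents_example() {
  // Three arbitrary sizes; reverse=true flips their order while packing.
  std::vector<int64_t> sizes = {64, 32, 8};

  ivec3 signed_extents = make_ivec3(sizes, /*reverse=*/true);  // (8, 32, 64)
  uvec3 unsigned_extents = make_uvec3(signed_extents);         // safe_downcast per element

  std::cout << signed_extents << std::endl;    // "(8, 32, 64)" via the new ivec3 overload
  std::cout << unsigned_extents << std::endl;  // same triple via the existing uvec3 overload
}
```

These overloads are what the `operator<<` forwarders in the new `Logging.h` further down in this patch lean on for `print_readable()`-style debug output.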
diff --git a/backends/vulkan/runtime/api/gen_vulkan_spv.py b/backends/vulkan/runtime/api/gen_vulkan_spv.py index 6b085277579..04b1f1582e5 100644 --- a/backends/vulkan/runtime/api/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/api/gen_vulkan_spv.py @@ -34,22 +34,13 @@ CPP_H_NAME = "spv.h" CPP_SRC_NAME = "spv.cpp" +# Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", - "FLOAT_IMAGE_FORMAT": "rgba16f", - "INT_IMAGE_FORMAT": "rgba32i", - "UINT_IMAGE_FORMAT": "rgba32ui", } -TYPES_ENV: Dict[str, Any] = { - "IMAGE_FORMAT": { - "float": "rgba32f", - "half": "rgba16f", - "int": "rgba32i", - "uint": "rgba32ui", - "int8": "rgba8i", - "uint8": "rgba8ui", - }, +# Establishes relationships between different tensor types and different GLSL types +TYPE_MAPPINGS: Dict[str, Any] = { "IMAGE_T": { 3: { "float": "image3D", @@ -78,29 +69,89 @@ "uint": "usampler2D", }, }, - "VEC4_T": { - "float": "vec4", - "half": "vec4", - "int": "ivec4", - "uint": "uvec4", - "int8": "vec4", - "uint8": "uvec4", - }, - "T": { - "float": "float", - "half": "float", - "int": "int", - "uint": "uint", - "int8": "int", - "uint8": "uint8", + "IMAGE_FORMAT": { + "float": "rgba32f", + "half": "rgba16f", + "int": "rgba32i", + "uint": "rgba32ui", + "int8": "rgba8i", + "uint8": "rgba8ui", }, } -FUNCS_ENV: Dict[str, Any] = { - "GET_POS": { + +def define_variable(name: str) -> str: + if name in locals(): + return f"#define {name} {locals()[name]}" + elif name in globals(): + return f"#define {name} {globals()[name]}" + else: + raise RuntimeError(f"{name} is not defined") + + +def get_buffer_scalar_type(dtype: str) -> str: + if dtype == "half": + return "float16_t" + elif dtype[-1] == "8": + return dtype + "_t" + + return dtype + + +def get_buffer_gvec_type(dtype: str, n: int) -> str: + if n == 1: + return get_buffer_scalar_type(dtype) + + if dtype == "float": + return f"vec{n}" + elif dtype == "half": + return f"f16vec{n}" + elif dtype == "int8": + return f"i8vec{n}" + elif dtype == "uint8": + return f"u8vec{n}" + + raise AssertionError(f"Invalid dtype: {dtype}") + + +def get_texel_type(dtype: str) -> str: + image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] + if image_format[-1] == "f": + return "vec4" + elif image_format[-2] == "ui": + return "uvec4" + elif image_format[-1] == "i": + return "ivec4" + raise AssertionError(f"Invalid image format: {image_format}") + + +def get_gvec_type(dtype: str, n: int) -> str: + gvec4_type = get_texel_type(dtype) + return gvec4_type[:-1] + str(n) + + +def get_texel_component_type(dtype: str) -> str: + vec4_type = get_texel_type(dtype) + if vec4_type[:3] == "vec": + return "float" + elif vec4_type[:4] == "ivec": + return "int" + elif vec4_type[:4] == "uvec": + return "uint" + raise AssertionError(f"Invalid vec4 type: {vec4_type}") + + +UTILITY_FNS: Dict[str, Any] = { + "macro_define": define_variable, + "get_pos": { 3: lambda pos: pos, 2: lambda pos: f"{pos}.xy", - } + }, + "buffer_scalar_type": get_buffer_scalar_type, + "buffer_gvec_type": get_buffer_gvec_type, + "texel_type": get_texel_type, + "gvec_type": get_gvec_type, + "texel_component_type": get_texel_component_type, } @@ -376,26 +427,6 @@ def create_shader_params( for key, value in variant_params.items(): shader_params[key] = value - shader_dtype = shader_params.get("DTYPE", "float") - - if shader_dtype == "int": - shader_params["FORMAT"] = self.env["INT_IMAGE_FORMAT"] - elif shader_dtype == "uint": - shader_params["FORMAT"] = self.env["UINT_IMAGE_FORMAT"] - elif shader_dtype == "int32": - 
shader_params["FORMAT"] = "rgba32i" - elif shader_dtype == "uint32": - shader_params["FORMAT"] = "rgba32ui" - elif shader_dtype == "int8": - shader_params["FORMAT"] = "rgba8i" - elif shader_dtype == "uint8": - shader_params["FORMAT"] = "rgba8ui" - elif shader_dtype == "float32": - shader_params["FORMAT"] = "rgba32f" - # Assume float by default - else: - shader_params["FORMAT"] = self.env["FLOAT_IMAGE_FORMAT"] - return shader_params def constructOutputMap(self) -> None: @@ -440,7 +471,7 @@ def generateSPV(self, output_dir: str) -> Dict[str, str]: glsl_out_path, "-o", spv_out_path, - "--target-env=vulkan1.0", + "--target-env=vulkan1.1", "-Werror", ] + [ arg @@ -543,13 +574,6 @@ def findRegisterFor(lineStr: str) -> Tuple[str, List[str]]: r"\buniform\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER", } -storageTypeToEnum = { - "TEXTURE_2D": "api::StorageType::TEXTURE_2D", - "TEXTURE_3D": "api::StorageType::TEXTURE_3D", - "BUFFER": "api::StorageType::BUFFER", - "": "api::StorageType::UNKNOWN", -} - def determineDescriptorType(lineStr: str) -> str: for identifier, typeNum in typeIdMapping.items(): @@ -632,7 +656,7 @@ def generateShaderInfoStr(shader_info: ShaderInfo, name: str, sizeBytes: int) -> tile_size = ( f"{{{', '.join(str(x) for x in shader_info.tile_size)}}}" if (len(shader_info.tile_size) > 0) - else "std::vector()" + else "{1, 1, 1}" ) shader_info_layouts = "{{{}}}".format(",\n ".join(shader_info.layouts)) @@ -643,8 +667,6 @@ def generateShaderInfoStr(shader_info: ShaderInfo, name: str, sizeBytes: int) -> str(sizeBytes), shader_info_layouts, tile_size, - storageTypeToEnum[shader_info.weight_storage_type], - storageTypeToEnum[shader_info.bias_storage_type], ] shader_info_str = textwrap.indent( @@ -741,9 +763,9 @@ def main(argv: List[str]) -> int: ) options = parser.parse_args() - DEFAULT_ENV.update(TYPES_ENV) - DEFAULT_ENV.update(FUNCS_ENV) env = DEFAULT_ENV + env.update(TYPE_MAPPINGS) + env.update(UTILITY_FNS) for key, value in parse_arg_env(options.env).items(): env[key] = value diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index dada914b22d..0c7941d6f52 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -17,6 +17,39 @@ namespace vkcompute { +// +// VTensorPtr +// + +#define VALUE_PTR_CLASS_IMPL(classname, ctype, type_name) \ + classname::classname(ComputeGraph* const graph, const ValueRef idx) \ + : graph_(graph), ptr_(&(graph_->values_.at(idx).to##type_name())) { \ + graph_->values_in_use_++; \ + } \ + ctype* classname::operator->() const { \ + return ptr_; \ + } \ + ctype& classname::operator*() const { \ + return *ptr_; \ + } \ + classname::~classname() { \ + graph_->values_in_use_--; \ + } + +VALUE_PTR_CLASS_IMPL(vTensorPtr, vTensor, Tensor) +VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) +VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) +VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) +VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) +VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) + +#undef VALUE_PTR_CLASS_IMPL + +// +// ComputeGraph +// + ComputeGraph::ComputeGraph(GraphConfig config) : config_{config}, prepack_descriptor_counts_{}, @@ -26,6 +59,7 @@ ComputeGraph::ComputeGraph(GraphConfig config) config_.contextConfig)}, shared_objects_{}, values_{}, + param_ubos_{}, prepack_nodes_{}, execute_nodes_{}, inputs_{}, @@ -86,7 +120,7 @@ api::StorageType 
ComputeGraph::suggested_storage_type() { if (config_.enableStorageTypeOverride) { return config_.storageTypeOverride; } - return api::StorageType::TEXTURE_3D; + return api::kTexture3D; } api::GPUMemoryLayout ComputeGraph::suggested_memory_layout( @@ -95,14 +129,43 @@ api::GPUMemoryLayout ComputeGraph::suggested_memory_layout( return config_.memoryLayoutOverride; } if (sizes.size() < 3) { - return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED; + return api::kWidthPacked; } // For 3 dimensional tensors that only have a channels dimension of 1, still // prefer width packed. if (api::utils::val_at(-3, sizes) == 1) { - return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED; + return api::kWidthPacked; } - return api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + return api::kChannelsPacked; +} + +void ComputeGraph::check_no_active_value_ptrs() { + VK_CHECK_COND( + values_in_use_ == 0, + "Make sure that there are no pointers stored from the return values of " + "`ComputeGraph::get_*()` functions in scope before adding Values to the " + "graph. Modifying the graph's values may cause existing pointers to be " + "invalidated."); +} + +std::vector ComputeGraph::get_sizes_of(ValueRef idx) { + Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toTensor().sizes(); + } else if (val.isTensorRef()) { + return val.toTensorRef().sizes; + } + VK_THROW("Could not get sizes of value with type ", val.type()); +} + +api::ScalarType ComputeGraph::get_dtype_of(ValueRef idx) { + Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toTensor().dtype(); + } else if (val.isTensorRef()) { + return val.toTensorRef().dtype; + } + VK_THROW("Could not get dtype of value with type ", val.type()); } ValueRef ComputeGraph::add_tensor( @@ -114,6 +177,7 @@ ValueRef ComputeGraph::add_tensor( bool allocate_memory = shared_object_idx < 0; ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(vTensor( context(), sizes, dtype, storage_type, memory_layout, allocate_memory)); @@ -132,16 +196,26 @@ ValueRef ComputeGraph::add_tensor( sizes, dtype, suggested_storage_type(), memory_layout, shared_object_idx); } +ValueRef ComputeGraph::add_tensor_like( + const ValueRef idx, + const api::StorageType storage_type, + const api::GPUMemoryLayout memory_layout) { + return add_tensor( + get_sizes_of(idx), get_dtype_of(idx), storage_type, memory_layout); +} + +ValueRef ComputeGraph::add_tensor_like( + const ValueRef idx, + const api::GPUMemoryLayout memory_layout) { + return add_tensor(get_sizes_of(idx), get_dtype_of(idx), memory_layout); +} + ValueRef ComputeGraph::add_tensor( const std::vector& sizes, const api::ScalarType dtype, const int64_t shared_object_idx) { return add_tensor( - sizes, - dtype, - suggested_storage_type(), - suggested_memory_layout(sizes), - shared_object_idx); + sizes, dtype, suggested_memory_layout(sizes), shared_object_idx); } ValueRef ComputeGraph::add_tensorref( @@ -149,6 +223,7 @@ ValueRef ComputeGraph::add_tensorref( const api::ScalarType dtype, const void* const data) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(TensorRef(sizes, dtype, data)); return idx; } @@ -157,24 +232,28 @@ ValueRef ComputeGraph::add_staging( const api::ScalarType dtype, const size_t numel) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); return idx; } ValueRef ComputeGraph::add_none() { ValueRef idx(static_cast(values_.size())); + 
check_no_active_value_ptrs(); values_.emplace_back(); return idx; } ValueRef ComputeGraph::add_value_list(std::vector&& value) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(std::move(value)); return idx; } ValueRef ComputeGraph::add_string(std::string&& str) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(std::move(str)); return idx; } @@ -183,8 +262,9 @@ ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { if (use_staging) { - vTensor& tensor = get_val(idx).toTensor(); - ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); + api::ScalarType dtype = get_tensor(idx)->dtype(); + size_t gpu_numel = get_tensor(idx)->gpu_numel(); + ValueRef staging_idx = add_staging(dtype, gpu_numel); add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back({idx, staging_idx}); return staging_idx; @@ -197,8 +277,9 @@ ValueRef ComputeGraph::set_output_tensor( const ValueRef idx, const bool use_staging) { if (use_staging) { - vTensor& tensor = get_val(idx).toTensor(); - ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); + api::ScalarType dtype = get_tensor(idx)->dtype(); + size_t gpu_numel = get_tensor(idx)->gpu_numel(); + ValueRef staging_idx = add_staging(dtype, gpu_numel); add_tensor_to_staging_node(*this, idx, staging_idx); outputs_.push_back({idx, staging_idx}); return staging_idx; @@ -218,20 +299,18 @@ void ComputeGraph::copy_into_staging( const ValueRef idx, const void* data, const size_t numel) { - Value& in_val = get_val(idx); - api::StorageBuffer& staging = in_val.toStaging(); - size_t nbytes = numel * api::element_size(staging.dtype()); - copy_ptr_to_staging(data, staging, nbytes); + StagingPtr staging = get_staging(idx); + size_t nbytes = numel * api::element_size(staging->dtype()); + copy_ptr_to_staging(data, *staging, nbytes); } void ComputeGraph::copy_from_staging( const ValueRef idx, void* data, const size_t numel) { - Value& out_val = get_val(idx); - api::StorageBuffer& staging = out_val.toStaging(); - size_t nbytes = numel * api::element_size(staging.dtype()); - copy_staging_to_ptr(staging, data, nbytes); + StagingPtr staging = get_staging(idx); + size_t nbytes = numel * api::element_size(staging->dtype()); + copy_staging_to_ptr(*staging, data, nbytes); } void ComputeGraph::prepare() { @@ -297,7 +376,7 @@ void ComputeGraph::resize_input( const int64_t idx, const std::vector& new_sizes) { IOValueRef io_val = inputs_.at(idx); - get_val(io_val.value).toTensor().virtual_resize(new_sizes); + get_tensor(io_val.value)->virtual_resize(new_sizes); } void ComputeGraph::propagate_resize() { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 00aa60020f3..00d8cbd3c55 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -10,6 +10,8 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName +#include + #include #include @@ -36,6 +38,38 @@ struct is_valid_scalar_type : std::true_type {}; template <> struct is_valid_scalar_type : std::true_type {}; +// +// Guarded Pointer Classes +// + +class ComputeGraph; + +#define DECL_VALUE_PTR_CLASS(classname, ctype) \ + class classname final { \ + ComputeGraph* const graph_; \ + ctype* ptr_; \ + \ + public: \ + explicit classname(ComputeGraph* const graph, const ValueRef idx); \ + ctype* operator->() const; \ + ctype& operator*() const; \ + ~classname(); \ + }; + 
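Since the macro above only declares the guard classes, a short usage sketch may make the intended discipline clearer; `graph`, `a`, and `b` are hypothetical placeholders, and everything called here (`get_tensor`, `sizes`, `add_tensor_like`, `check_no_active_value_ptrs`) is introduced elsewhere in this diff.

```cpp
// Sketch only; assumes the vkcompute namespace and the ComputeGraph declarations are in scope.
void guarded_access_example(ComputeGraph& graph, ValueRef a, ValueRef b) {
  {
    vTensorPtr t_a = graph.get_tensor(a);       // constructor increments values_in_use_
    vTensorPtr t_b = graph.get_tensor(b);
    std::vector<int64_t> sizes = t_a->sizes();  // safe while the guards are alive
    (void)t_b;
    (void)sizes;
  }  // guard destructors decrement values_in_use_

  // New values may only be added once no guards remain in scope; otherwise
  // check_no_active_value_ptrs() fails, because growing values_ can reallocate
  // the underlying vector and invalidate the raw pointers held by the guards.
  ValueRef c = graph.add_tensor_like(a, api::kChannelsPacked);
  (void)c;
}
```

The old `get_val()` accessor handed out raw references with the same invalidation hazard but no runtime check; routing access through these guard objects makes the hazard detectable instead of silently undefined.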
+DECL_VALUE_PTR_CLASS(vTensorPtr, vTensor) +DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) +DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) +DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) +DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) + +#undef DECL_VALUE_PTR_CLASS + +// +// ComputeGraph +// + /* * This is the core data structure used to execute Vulkan models in graph mode. * As opposed to ATen/eager mode where a command buffer is encoded every @@ -61,6 +95,7 @@ class ComputeGraph final { std::unique_ptr context_; std::vector shared_objects_; std::vector values_; + std::vector param_ubos_; std::vector> prepack_nodes_; std::vector> execute_nodes_; @@ -68,6 +103,9 @@ class ComputeGraph final { std::vector inputs_; std::vector outputs_; + protected: + size_t values_in_use_ = 0; + public: // // Accessors @@ -89,34 +127,73 @@ class ComputeGraph final { const api::ShaderInfo& shader_info, bool execute); - /* - * Returns the value at a particular index in the graph. If storing this - * function's return value in a lvalue reference, it is imperative that no - * values are added to the graph while the reference is in scope, otherwise - * the underlying value may have been moved as part of a vector resize. - */ - inline Value& get_val(ValueRef idx) { - return values_.at(idx); +#define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \ + inline ptr_type get_##short_name(const ValueRef idx) { \ + return ptr_type(this, idx); \ + } \ + inline bool val_is_##short_name(const ValueRef idx) { \ + return values_.at(idx).is##type_name(); \ + } + + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(vTensorPtr, tensor, Tensor) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) + +#undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS + +#define GET_AND_CHECK_VAL_AS_TYPE_FNS(ctype, short_name, type_name) \ + inline ctype get_##short_name(const ValueRef idx) { \ + return values_.at(idx).to##type_name(); \ + } \ + inline bool val_is_##short_name(const ValueRef idx) { \ + return values_.at(idx).is##type_name(); \ + } + + GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t, int, Int) + GET_AND_CHECK_VAL_AS_TYPE_FNS(double, double, Double) + GET_AND_CHECK_VAL_AS_TYPE_FNS(bool, bool, Bool) + GET_AND_CHECK_VAL_AS_TYPE_FNS(std::string, string, String) + +#undef GET_AND_CHECK_VAL_AS_TYPE_FNS + + inline bool val_is_none(const ValueRef idx) { + return values_.at(idx).isNone(); + } + + inline TypeTag get_val_type(const ValueRef idx) { + return values_.at(idx).type(); } - inline const std::vector& get_val_sizes(ValueRef idx) { - Value& val = get_val(idx); - if (val.isTensor()) { - return val.toTensor().sizes(); - } else if (val.isTensorRef()) { - return val.toTensorRef().sizes; + std::vector get_sizes_of(ValueRef idx); + + api::ScalarType get_dtype_of(ValueRef idx); + + template + T extract_scalar(const ValueRef idx) { + Value& value = values_.at(idx); + if (value.isInt()) { + return static_cast(value.toInt()); + } + if (value.isDouble()) { + return static_cast(value.toDouble()); + } + if (value.isBool()) { + return static_cast(value.toBool()); } - VK_THROW("Could not get sizes of value 
with type ", val.type()); + VK_THROW("Cannot extract scalar from Value with type ", value.type()); } - inline api::ScalarType get_val_dtype(ValueRef idx) { - Value& val = get_val(idx); - if (val.isTensor()) { - return val.toTensor().dtype(); - } else if (val.isTensorRef()) { - return val.toTensorRef().dtype; + template + std::optional extract_optional_scalar(const ValueRef idx) { + if (val_is_none(idx)) { + return ::std::nullopt; + } else { + return extract_scalar(idx); } - VK_THROW("Could not get dtype of value with type ", val.type()); } inline std::vector>& prepack_nodes() { @@ -156,13 +233,17 @@ class ComputeGraph final { * Returns the memory layout of a Tensor value at the specified index. */ inline api::GPUMemoryLayout memory_layout_of(ValueRef idx) { - return get_val(idx).toTensor().gpu_memory_layout(); + return get_tensor(idx)->gpu_memory_layout(); } // // Graph Building // + private: + void check_no_active_value_ptrs(); + + public: /* * Add a `vTensor` value to the graph with the specified properties. There are * various convenience overloads of this function that may be used instead. @@ -172,7 +253,7 @@ class ComputeGraph final { const api::ScalarType dtype, const api::StorageType storage_type, const api::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx); + const int64_t shared_object_idx = -1); /* * Add a `vTensor` value to the graph with the specified properties. The @@ -191,9 +272,25 @@ class ComputeGraph final { */ ValueRef add_tensor( const std::vector& sizes, - const api::ScalarType dtype = api::ScalarType::Float, + const api::ScalarType dtype, const int64_t shared_object_idx = -1); + /* + * Add a `vTensor` value to the graph with the properties of `vref`. + */ + ValueRef add_tensor_like( + const ValueRef vref, + const api::StorageType storage_type, + const api::GPUMemoryLayout memory_layout); + + /* + * Add a `vTensor` value to the graph with the properties of `vref`. The + * suggested storage type will be used to construct the `vTensor`. + */ + ValueRef add_tensor_like( + const ValueRef vref, + const api::GPUMemoryLayout memory_layout); + /* * Add a `TensorRef` value to the graph with the specific properties. 
A * `TensorRef` is a reference to a `vTensor` whose data is stored in an @@ -229,9 +326,9 @@ class ComputeGraph final { ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - inline std::shared_ptr create_params_buffer( - const Block& data) { - return std::make_shared(context_.get(), data); + const api::BufferBindInfo create_params_buffer(const Block& data) { + param_ubos_.emplace_back(api::UniformParamsBuffer(context_.get(), data)); + return api::BufferBindInfo(param_ubos_.back().buffer()); } /* @@ -296,12 +393,31 @@ class ComputeGraph final { void resize_input(const int64_t idx, const std::vector& new_sizes); void propagate_resize(); + + // + // Debug support (implemented in Logging.cpp) + // + + void print_readable(); + + // + // Friend classes + // + + friend class vTensorPtr; + friend class TensorRefPtr; + friend class StagingPtr; + friend class IntListPtr; + friend class DoubleListPtr; + friend class BoolListPtr; + friend class ValueListPtr; }; template inline typename std::enable_if::value, ValueRef>::type ComputeGraph::add_scalar(T value) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(value); return idx; } @@ -310,6 +426,7 @@ template inline typename std::enable_if::value, ValueRef>::type ComputeGraph::add_scalar_list(std::vector&& value) { ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); values_.emplace_back(std::move(value)); return idx; } diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp index 7e3d1dfbf98..98b2d9a4263 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.cpp +++ b/backends/vulkan/runtime/graph/GraphConfig.cpp @@ -48,16 +48,16 @@ GraphConfig::GraphConfig() { // of memory, increase this safety factor. descriptorPoolSafetyFactor = 1.25; - // For now, force TEXTURE_3D storage as we are still developing shader - // support for buffer storage type. + // For now, force kTexture3D storage as we are still developing shader support + // for buffer storage type. enableStorageTypeOverride = true; - storageTypeOverride = api::StorageType::TEXTURE_3D; + storageTypeOverride = api::kTexture3D; - // For now, force TENSOR_WIDTH_PACKED memory layout by default as we are still + // For now, force kWidthPacked memory layout by default as we are still // developing support for other memory layouts. In the future memory layout // settings will be serialized as part of the graph. enableMemoryLayoutOverride = true; - memoryLayoutOverride = api::GPUMemoryLayout::TENSOR_WIDTH_PACKED; + memoryLayoutOverride = api::kWidthPacked; } void GraphConfig::setStorageTypeOverride(api::StorageType storage_type) { diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp new file mode 100644 index 00000000000..00d7837503a --- /dev/null +++ b/backends/vulkan/runtime/graph/Logging.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +void ComputeGraph::print_readable() { + std::set input_set; + for (const IOValueRef& io_val : inputs()) { + input_set.insert(io_val.value); + } + + std::set output_set; + for (const IOValueRef& io_val : outputs()) { + output_set.insert(io_val.value); + } + + std::set prepack_set; + for (const std::unique_ptr& node : prepack_nodes()) { + prepack_set.insert(node->tref_); + prepack_set.insert(node->packed_); + } + + std::map value_ref_to_shared_object_idx; + + std::cout << "====================" << std::left << std::setfill('=') + << std::setw(40) << " Shared Object List " << std::right + << std::setfill(' ') << std::endl; + + std::cout << std::setw(6) << "idx" << std::setw(20) << "sizes" + << std::setw(24) << "users" << std::endl; + + size_t so_idx = 0; + for (const SharedObject& shared_object : shared_objects_) { + std::cout << std::setw(6) << so_idx; + { + std::stringstream ss; + ss << shared_object.aggregate_memory_requirements.size; + std::cout << std::setw(20) << ss.str(); + } + + { + std::stringstream ss; + ss << shared_object.users; + std::cout << std::setw(24) << ss.str(); + } + std::cout << std::endl; + + for (const ValueRef& user : shared_object.users) { + value_ref_to_shared_object_idx[user] = so_idx; + } + + so_idx++; + } + + std::cout << "====================" << std::left << std::setfill('=') + << std::setw(40) << " Value List " << std::right + << std::setfill(' ') << std::endl; + + std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) + << "sizes" << std::setw(10) << "node_type" << std::setw(10) + << "so_idx" << std::endl; + + size_t value_idx = 0; + for (Value& val : values_) { + std::cout << std::setw(6) << value_idx << std::setw(10) << val.type(); + + // sizes + std::cout << std::setw(20); + if (val.isTensor()) { + const vTensor& v_tensor = val.toTensor(); + std::stringstream ss; + ss << v_tensor.sizes(); + std::cout << ss.str(); + } else if (val.isTensorRef()) { + const TensorRef tensor_ref = val.toTensorRef(); + std::stringstream ss; + ss << tensor_ref.sizes; + std::cout << ss.str(); + } else { + std::cout << ""; + } + + // Node type + std::cout << std::setw(10); + { + if (input_set.count(value_idx) > 0) { + std::cout << "INPUT"; + } else if (output_set.count(value_idx) > 0) { + std::cout << "OUTPUT"; + } else if (prepack_set.count(value_idx) > 0) { + std::cout << "PREPACK"; + } else { + std::cout << ""; + } + } + + std::cout << std::setw(10); + if (value_ref_to_shared_object_idx.count(value_idx) > 0) { + size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); + std::cout << shared_obj_idx; + } else { + std::cout << ""; + } + + std::cout << std::endl; + value_idx++; + } + + std::cout << "====================" << std::left << std::setfill('=') + << std::setw(40) << " Prepack Node List " << std::right + << std::setfill(' ') << std::endl; + std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" + << std::setw(8) << "tref" << std::setw(8) << "packed" << std::endl; + + size_t prepack_node_idx = 0; + for (const std::unique_ptr& node : prepack_nodes()) { + std::cout << std::setw(6) << prepack_node_idx << std::setw(32) + << node->shader_.kernel_name << std::setw(8) << node->tref_ + << std::setw(8) << node->packed_ << std::endl; + + prepack_node_idx++; + } + + std::cout << "====================" << std::left << std::setfill('=') + << std::setw(40) << " Execute Node List " << std::right + << std::setfill(' ') << std::endl; + + 
std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" + << std::setw(24) << "READ_arg" << std::setw(24) << "WRITE_arg" + << std::endl; + + size_t node_idx = 0; + for (const std::unique_ptr& node : execute_nodes()) { + std::cout << std::setw(6) << node_idx; + std::cout << std::setw(32) << node->shader_.kernel_name; + + std::stringstream read_s; + for (const ArgGroup& arg_group : node->args_) { + if (arg_group.access != api::MemoryAccessType::READ) { + continue; + } + read_s << arg_group.refs; + } + std::cout << std::setw(24) << read_s.str(); + + std::stringstream write_s; + for (const ArgGroup& arg_group : node->args_) { + if (arg_group.access != api::MemoryAccessType::WRITE) { + continue; + } + write_s << arg_group.refs; + } + std::cout << std::setw(24) << write_s.str(); + + std::cout << std::endl; + + node_idx++; + } +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h new file mode 100644 index 00000000000..447d52d16bd --- /dev/null +++ b/backends/vulkan/runtime/graph/Logging.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace vkcompute { + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) { + os << '['; + for (const auto& elem : vec) { + os << elem << ','; + } + os << ']'; + return os; // Return the ostream to allow chaining +} + +inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec3& v) { + return api::utils::operator<<(os, v); +} + +inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) { + return api::utils::operator<<(os, v); +} + +inline std::ostream& operator<<(std::ostream& os, const api::utils::ivec3& v) { + return api::utils::operator<<(os, v); +} + +inline std::ostream& operator<<(std::ostream& os, const api::utils::ivec4& v) { + return api::utils::operator<<(os, v); +} + +template +inline std::ostream& operator<<(std::ostream& os, const std::optional& opt) { + os << "["; + if (opt) { + os << opt.value(); + } + os << "]"; + return os; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h index 3d85d60ce0e..a2ce20bad85 100644 --- a/backends/vulkan/runtime/graph/containers/Constant.h +++ b/backends/vulkan/runtime/graph/containers/Constant.h @@ -28,12 +28,6 @@ struct TensorRef final { const std::vector& t_sizes, api::ScalarType t_dtype, const void* const t_data); - - TensorRef(const TensorRef&) = default; - TensorRef& operator=(const TensorRef&) = default; - - TensorRef(TensorRef&&) = default; - TensorRef& operator=(TensorRef&&) = default; }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp index ad6ea54f9d9..cbc526700c3 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ b/backends/vulkan/runtime/graph/containers/SharedObject.cpp @@ -13,13 +13,13 @@ namespace vkcompute { void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { - vTensor& t = graph->get_val(idx).toTensor(); + vTensorPtr t = graph->get_tensor(idx); // // Aggregate Memory Requirements // - const VkMemoryRequirements mem_reqs = 
t.get_memory_requirements(); + const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); aggregate_memory_requirements.size = std::max(mem_reqs.size, aggregate_memory_requirements.size); aggregate_memory_requirements.alignment = @@ -30,7 +30,7 @@ void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { // Aggregate Allocation Create Info // - const VmaAllocationCreateInfo create_info = t.get_allocation_create_info(); + const VmaAllocationCreateInfo create_info = t->get_allocation_create_info(); // Clear out CREATE_STRATEGY bit flags in case of conflict VmaAllocationCreateFlags clear_mask = ~VMA_ALLOCATION_CREATE_STRATEGY_MASK; VmaAllocationCreateFlags create_flags = create_info.flags & clear_mask; @@ -62,7 +62,7 @@ void SharedObject::bind_users(ComputeGraph* const graph) { return; } for (const ValueRef idx : users) { - graph->get_val(idx).toTensor().bind_allocation(allocation); + graph->get_tensor(idx)->bind_allocation(allocation); } } diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 948d4c2c12d..2e5da86a723 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -93,9 +93,10 @@ struct Value final { payload.u.member_name = rhs.payload.u.member_name; \ break; -#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name) \ +#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name, dtor_name) \ case type_tag: \ new (&payload.member_name) type(std::move(rhs.payload.member_name)); \ + rhs.payload.member_name.~dtor_name(); \ break; Value(Value&& rhs) noexcept : tag(rhs.tag) { @@ -105,20 +106,23 @@ struct Value final { CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double); CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool); // Tensor and tensor adjacent types - CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSOR, vTensor, as_tensor); - CASE_MOVE_MOVEABLE_TYPE(TypeTag::STAGING, api::StorageBuffer, as_staging); - CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSORREF, TensorRef, as_tensorref); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSOR, vTensor, as_tensor, vTensor); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists CASE_MOVE_MOVEABLE_TYPE( - TypeTag::INTLIST, std::vector, as_int_list); + TypeTag::INTLIST, std::vector, as_int_list, vector); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::DOUBLELIST, std::vector, as_double_list); + TypeTag::DOUBLELIST, std::vector, as_double_list, vector); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::BOOLLIST, std::vector, as_bool_list); + TypeTag::BOOLLIST, std::vector, as_bool_list, vector); // Special types CASE_MOVE_MOVEABLE_TYPE( - TypeTag::VALUELIST, std::vector, as_value_list); - CASE_MOVE_MOVEABLE_TYPE(TypeTag::STRING, std::string, as_string); + TypeTag::VALUELIST, std::vector, as_value_list, vector); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::STRING, std::string, as_string, basic_string); case TypeTag::NONE: clearToNone(); diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 08a17a18872..95e7ead3452 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -20,7 +20,8 @@ ExecuteNode::ExecuteNode( const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - const std::vector>& params, + const 
api::ParamsBindList& params, + const api::SpecVarList& spec_vars, const ResizeFunction& resize_fn, const std::vector& resize_args) : shader_(shader), @@ -28,6 +29,7 @@ ExecuteNode::ExecuteNode( local_workgroup_size_(local_workgroup_size), args_(args), params_(params), + spec_vars_(spec_vars), resize_fn_(resize_fn), resize_args_(resize_args) { graph.update_descriptor_counts(shader, /*execute = */ true); @@ -40,11 +42,12 @@ void ExecuteNode::encode(ComputeGraph* graph) { std::unique_lock cmd_lock = context->dispatch_lock(); api::DescriptorSet descriptor_set = - context->get_descriptor_set(shader_, local_workgroup_size_); + context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_); uint32_t idx = 0; idx = bind_values_to_descriptor_set( graph, args_, pipeline_barrier, descriptor_set, idx); + bind_params_to_descriptor_set(params_, descriptor_set, idx); context->register_shader_dispatch( diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index b63273023ed..b211cb2c91f 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -54,7 +54,8 @@ class ExecuteNode final { const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - const std::vector>& params, + const api::ParamsBindList& params, + const api::SpecVarList& spec_vars = {}, const ResizeFunction& resize_fn = nullptr, const std::vector& resize_args = {}); @@ -73,7 +74,8 @@ class ExecuteNode final { const api::utils::uvec3 global_workgroup_size_; const api::utils::uvec3 local_workgroup_size_; const std::vector args_; - std::vector> params_; + const api::ParamsBindList params_; + const api::SpecVarList spec_vars_; const ResizeFunction resize_fn_; const std::vector resize_args_; }; diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp index 449c31508a5..4d1f749830c 100644 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp +++ b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp @@ -16,7 +16,9 @@ bool OperatorRegistry::has_op(const std::string& name) { OperatorRegistry::OpFunction& OperatorRegistry::get_op_fn( const std::string& name) { - return table_.find(name)->second; + const auto it = table_.find(name); + VK_CHECK_COND(it != table_.end(), "Could not find operator with name ", name); + return it->second; } void OperatorRegistry::register_op(const std::string& name, OpFunction& fn) { diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index 60d1982d97e..9d4bc98ac57 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -11,10 +11,19 @@ #include #include +#include #include namespace vkcompute { +api::ShaderInfo get_noop_shader(ComputeGraph& graph, const ValueRef packed) { + std::string noop_shader_name("no_op"); + vTensorPtr t_packed = graph.get_tensor(packed); + add_ndim_suffix(noop_shader_name, *t_packed); + add_dtype_suffix(noop_shader_name, *t_packed); + return VK_KERNEL_FROM_STR(noop_shader_name); +} + PrepackNode::PrepackNode( ComputeGraph& graph, const api::ShaderInfo& shader, @@ -22,45 +31,87 @@ PrepackNode::PrepackNode( const api::utils::uvec3& local_workgroup_size, const ValueRef tref, const ValueRef packed, - const std::vector>& params) + const api::ParamsBindList& params, + const api::SpecVarList& spec_vars) : shader_(shader), + 
noop_shader_(get_noop_shader(graph, packed)), global_workgroup_size_(global_workgroup_size), local_workgroup_size_(local_workgroup_size), tref_(tref), packed_(packed), - params_(params) { + params_(params), + spec_vars_(spec_vars) { graph.update_descriptor_counts(shader, /*execute = */ false); + graph.update_descriptor_counts(noop_shader_, /*execute = */ false); +} + +api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { + vTensorPtr packed = graph->get_tensor(packed_); + + // If no TensorRef is provided, create a staging buffer of zeros according to + // the vTensor metadata. + if (graph->val_is_none(tref_)) { + size_t numel = api::utils::multiply_integers(packed->sizes()); + api::StorageBuffer staging(graph->context(), packed->dtype(), numel); + size_t nbytes = numel * api::element_size(packed->dtype()); + set_staging_zeros(staging, nbytes); + return staging; + } + + TensorRefPtr tref = graph->get_tref(tref_); + size_t numel = api::utils::multiply_integers(tref->sizes); + api::StorageBuffer staging(graph->context(), tref->dtype, numel); + size_t nbytes = numel * api::element_size(tref->dtype); + copy_ptr_to_staging(tref->data, staging, nbytes); + return staging; } void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); - api::PipelineBarrier pipeline_barrier{}; - TensorRef tref = graph->get_val(tref_).toTensorRef(); - vTensor packed = graph->get_val(packed_).toTensor(); - - size_t numel = api::utils::multiply_integers(tref.sizes); - api::StorageBuffer staging(graph->context(), tref.dtype, numel); - size_t nbytes = numel * api::element_size(tref.dtype); - copy_ptr_to_staging(tref.data, staging, nbytes); + vTensorPtr packed = graph->get_tensor(packed_); + api::StorageBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); - api::DescriptorSet descriptor_set = - context->get_descriptor_set(shader_, local_workgroup_size_); - - uint32_t idx = 0; - bind_tensor_to_descriptor_set( - packed, - pipeline_barrier, - api::MemoryAccessType::WRITE, - descriptor_set, - idx++); - bind_staging_to_descriptor_set(staging, descriptor_set, idx++); - bind_params_to_descriptor_set(params_, descriptor_set, idx); - - context->register_shader_dispatch( - descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); + { + api::PipelineBarrier pipeline_barrier{}; + api::DescriptorSet descriptor_set = + context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_); + + uint32_t idx = 0; + bind_tensor_to_descriptor_set( + *packed, + pipeline_barrier, + api::MemoryAccessType::WRITE, + descriptor_set, + idx++); + bind_staging_to_descriptor_set(staging, descriptor_set, idx++); + bind_params_to_descriptor_set(params_, descriptor_set, idx); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); + } + + // Submit a compute shader that performs a no-op with the packed tensor in + // order to trigger an image layout transition from GENERAL to + // READ_ONLY_OPTIMAL. This ensures that future uses of the tensor will be + // bound with the correct image layout. 
+ { + api::PipelineBarrier pipeline_barrier{}; + api::DescriptorSet descriptor_set = + context->get_descriptor_set(noop_shader_, {1, 1, 1}); + + bind_tensor_to_descriptor_set( + *packed, + pipeline_barrier, + api::MemoryAccessType::READ, + descriptor_set, + 0); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, noop_shader_, {1, 1, 1}); + } } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index dd31be12b37..92e24c5818e 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -33,7 +33,8 @@ class PrepackNode final { const api::utils::uvec3& local_workgroup_size, const ValueRef tref, const ValueRef packed, - const std::vector>& params); + const api::ParamsBindList& params, + const api::SpecVarList& spec_vars = {}); ~PrepackNode() = default; @@ -41,12 +42,16 @@ class PrepackNode final { protected: const api::ShaderInfo shader_; + api::ShaderInfo noop_shader_; const api::utils::uvec3 global_workgroup_size_; const api::utils::uvec3 local_workgroup_size_; const ValueRef tref_; const ValueRef packed_; - // TODO(T180906457): allow re-computing param buffers. - std::vector> params_; + const api::ParamsBindList params_; + const api::SpecVarList spec_vars_; + + private: + api::StorageBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index c648db2c4c2..cf8521fa2b3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -8,12 +8,14 @@ #version 450 core -#include "broadcasting_utils.h" -#include "indexing_utils.h" - #define PRECISION ${PRECISION} -#define OP(X, Y, A) ${OPERATOR} +#define VEC4_T ${texel_type(DTYPE)} + +#define op(X, Y, A) ${OPERATOR} + +#include "broadcasting_utils.h" +#include "indexing_utils.h" layout(std430) buffer; @@ -22,59 +24,56 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other; layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes { - ivec4 data; -} -out_sizes; + ivec4 out_sizes; +}; layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { - ivec4 data; -} -in_sizes; + ivec4 in_sizes; +}; layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes { - ivec4 data; -} -other_sizes; + ivec4 other_sizes; +}; layout(set = 0, binding = 6) uniform PRECISION restrict BroadcastParams { - ivec2 data; -} -broadcast_params; + ivec2 broadcast_params; +}; layout(set = 0, binding = 7) uniform PRECISION restrict Alpha { - float data; -} -alpha; + float alpha; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, out_sizes.data); + const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - if (any(greaterThanEqual(coord, out_sizes.data))) { + if (any(greaterThanEqual(idx, out_sizes))) { return; } - ivec4 in_coord = out_coord_to_in_coord(coord, in_sizes.data); - ${VEC4_T[DTYPE]} in_texel = ${VEC4_T[DTYPE]}(texelFetch( + ivec4 in_idx = broadcast_indices(idx, in_sizes); + VEC4_T in_texel = VEC4_T(texelFetch( image_in, - COORD_TO_POS_${PACKING}(in_coord, in_sizes.data), + to_texture_pos(in_idx, 
in_sizes, packed_dim), 0)); - ivec4 other_coord = out_coord_to_in_coord(coord, other_sizes.data); - ${VEC4_T[DTYPE]} other_texel = ${VEC4_T[DTYPE]}(texelFetch( + ivec4 other_idx = broadcast_indices(idx, other_sizes); + VEC4_T other_texel = VEC4_T(texelFetch( image_other, - COORD_TO_POS_${PACKING}(other_coord, other_sizes.data), + to_texture_pos(other_idx, other_sizes, packed_dim), 0)); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. - if (broadcast_params.data.x > 0) { + if (broadcast_params.x > 0) { in_texel = in_texel.xxxx; } - if (broadcast_params.data.y > 0) { + if (broadcast_params.y > 0) { other_texel = other_texel.xxxx; } - imageStore(image_out, pos, ${VEC4_T[DTYPE]}(OP(in_texel, other_texel, alpha.data))); + imageStore(image_out, pos, VEC4_T(op(in_texel, other_texel, alpha))); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml index 28f65ee29c7..e5334fcbb6e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -9,22 +9,12 @@ binary_op: OPERATOR: X + A * Y NDIM: 3 DTYPE: float - PACKING: CHANNELS_PACKED + PACKING: C_packed generate_variant_forall: - PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - NAME: binary_add - NAME: binary_sub diff --git a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h index 55fd8b8e482..840e98a25ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h @@ -6,12 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) { - ivec4 in_coord = out_coord; +ivec4 broadcast_indices(const ivec4 out_idx, const ivec4 in_sizes) { + ivec4 in_idx = out_idx; for (int i = 0; i < 4; ++i) { - if (out_coord[i] >= in_sizes[i]) { - in_coord[i] = 0; + if (out_idx[i] >= in_sizes[i]) { + in_idx[i] = 0; } } - return in_coord; + return in_idx; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl b/backends/vulkan/runtime/graph/ops/glsl/clone.glsl new file mode 100644 index 00000000000..64def8d7000 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/clone.glsl @@ -0,0 +1,30 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + imageStore(image_out, pos, texelFetch(image_in, pos, 0)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml b/backends/vulkan/runtime/graph/ops/glsl/clone.yaml new file mode 100644 index 00000000000..5dbce0e9d8d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/clone.yaml @@ -0,0 +1,10 @@ +clone: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: clone diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl new file mode 100644 index 00000000000..b77f171dcc9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl @@ -0,0 +1,124 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + int kernel_size; + int stride; + int padding; + int dilation; + int in_group_size; + int out_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Let us define +// +// input = (N, in_C, in_L), +// output = (N, out_C, out_L), +// groups = G, +// kernel = K, +// +// which results in shapes +// +// weight = (out_C, in_C / G, K), +// bias = (out_C,). +// +// This implementation performs out_C shader invocations, where each invocation +// calculates the rolling kernel of the length dimension for each batch, i.e., +// computes out_L * N results. +// +// Note that we can rewrite this implementation as out_L * out_C * ceil(N / 4) +// shader invocations, where each invocation computes 1 result. But that +// performs worse. +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + int in_length = in_sizes.x; + int batch_size = in_sizes.z; + + // "out_c" is the output's channel index where we write our result. + // Across shader invocations, this is the only value that varies. + int out_c = pos.y; + vec4 bias = texelFetch(bias_in, ivec3(out_c, 0, 0), 0); + + // "in_c" tracks the input's channel start index. 
+ // We iterate over the input group that corresponds to the output group. + int c_start = (out_c / out_group_size) * in_group_size; + int c_end = c_start + in_group_size; + + // "in_l" tracks the input's length start index for our input-kernel overlay + // region. + int l_start = -padding; + int l_end = in_length + padding - dilation * (kernel_size - 1); + + // Since the input/output tensors are channel-packed, which is along the + // batch dimension, we can batch-read/write four elements at a time. + for (int n = 0; n < batch_size; n += 4) { + // "out_l" tracks the output's length index where we write our result. + int out_l = 0; + + for (int in_l = l_start; in_l < l_end; in_l += stride, ++out_l) { + vec4 sum = vec4(0); + + for (int in_c = c_start; in_c < c_end; ++in_c) { + // "k" tracks the kernel's index for our input-kernel computation. + // It reads out-of-bound zeros, but trying to avoid them complicates + // for-loop conditions, which results in worse performance. + for (int k = 0; k < kernel_size; k += 4) { + // Since the weight tensor is width-packed, which is along the length + // dimension, we can batch-read four elements at a time. + const ivec3 w_pos = ivec3(k / 4, in_c % in_group_size, out_c); + const vec4 weight = texelFetch(kernel_in, w_pos, 0); + + const ivec3 in_pos_0 = ivec3(in_l + k * dilation, in_c, n / 4); + sum = fma(weight.xxxx, texelFetch(image_in, in_pos_0, 0), sum); + + const ivec3 in_pos_1 = ivec3(in_l + (k+1) * dilation, in_c, n / 4); + sum = fma(weight.yyyy, texelFetch(image_in, in_pos_1, 0), sum); + + const ivec3 in_pos_2 = ivec3(in_l + (k+2) * dilation, in_c, n / 4); + sum = fma(weight.zzzz, texelFetch(image_in, in_pos_2, 0), sum); + + const ivec3 in_pos_3 = ivec3(in_l + (k+3) * dilation, in_c, n / 4); + sum = fma(weight.wwww, texelFetch(image_in, in_pos_3, 0), sum); + } + } + + ivec3 out_pos = ivec3(out_l, out_c, n / 4); + imageStore(image_out, out_pos, sum + bias.x); + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml new file mode 100644 index 00000000000..ad1e419e6d3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv1d: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv1d diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl new file mode 100644 index 00000000000..33f5ff9dd3e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -0,0 +1,128 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; +}; + +// If fields are separated, SwiftShader cannot identify in_group_size. +layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams { + ivec2 overlay_region; + int in_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * Computes a 2D convolution. Each shader invocation calculates the output at + * a single output location. + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Compute the start and end of the input indices to load. Padding is assumed + // to be constant 0 padding, so reads from the padding region are skipped. + const ivec2 start = max(ivec2(0), ipos); + const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); + // Compute the start of the kernel based on how far we are skipping ahead when + // reading the input. Note that these are "canonical" indices. + ivec2 kstart = (start - ipos) / dilation; + // During prepacking, the weight tensor was rearranged in order to optimize + // for data access linearity in this shader. Therefore we need to adjust the + // canonical coordinates to the corresponding index in the rearranged weight + // tensor. The x-coordinate is multipled by 4 since each group of 4 channels + // is folded into the X axis. The y-coordinate is offset based on the z- + // coordinate because the 2D planes were stacked atop each other vertically. + kstart.x *= 4; + kstart.y += pos.z * kernel_size.y; + + // Perform the convolution by iterating over the overlay region. 
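Before the accumulation loop that follows, it may help to see the start/end/kstart arithmetic worked through once in plain Python. The sketch assumes, as the name suggests, that overlay_region.xy equals the dilated kernel extent, dilation * (kernel_size - 1) + 1; that assumption and all input values below are purely for illustration:

```python
def conv2d_window(pos_xy, stride, padding, dilation, kernel_size, in_wh, pos_z):
    """Reproduce the start/end/kstart arithmetic of conv2d.glsl for one invocation.

    Assumes overlay_region.xy == dilation * (kernel_size - 1) + 1 (the dilated
    kernel extent); all values are arbitrary illustration inputs.
    """
    # Top-left of the overlay region; negative values lie in the zero padding.
    ipos = [pos_xy[i] * stride[i] - padding[i] for i in range(2)]
    overlay = [dilation[i] * (kernel_size[i] - 1) + 1 for i in range(2)]
    # Clamp to the input bounds so that reads from the padding region are skipped.
    start = [max(0, ipos[i]) for i in range(2)]
    end = [min(ipos[i] + overlay[i], in_wh[i]) for i in range(2)]
    # Canonical kernel start, then remapped into the prepacked weight layout:
    # x is scaled by 4 (groups of 4 input channels folded into the x axis),
    # y is offset by the output-channel plane, since planes are stacked vertically.
    kstart = [(start[i] - ipos[i]) // dilation[i] for i in range(2)]
    kstart[0] *= 4
    kstart[1] += pos_z * kernel_size[1]
    return start, end, kstart

# Output texel at (x=0, y=0) in output-channel plane 2, 3x3 kernel, stride 1, padding 1.
print(conv2d_window((0, 0), (1, 1), (1, 1), (1, 1), (3, 3), (8, 8), pos_z=2))
# -> ([0, 0], [2, 2], [4, 7])
```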
+ VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + const int ic4 = in_group_size / 4; + for (int z4 = 0; z4 < ic4; ++z4, kstart.x += kernel_size.x * 4) { + for (int y = start.y, ky = kstart.y; y < end.y; y += dilation.y, ++ky) { + for (int x = start.x, kx = kstart.x; x < end.x; x += dilation.x, kx += 4) { + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); + const ivec4 kxs = kx + ivec4(0, 1, 2, 3); + + // To explain the calculation below, the contents of in_texel and the + // group of 4 texels loaded from kernel_in are shown: + // + // in_texel kernel_in + // -x-> ---x---> + // +---+ +----+----+----+----+ + // ^ | w | ^ | D0 | D1 | D2 | D3 | + // | +---+ | +----+----+----+----+ + // | | z | | | C0 | C1 | C2 | C3 | + // z +---+ z +----+----+----+----+ + // | | y | | | B0 | B1 | B2 | B3 | + // | +---+ | +----+----+----+----+ + // | x | | A0 | A1 | A2 | A3 | + // +---+ +----+----+----+----+ + // + // In the kernel_in graphic, cells sharing the same letter are from + // the same batch/output channel index, and the number denotes a unique + // channel index. To calculate the output texel, the following + // calculation is performed: + // + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | + // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ + // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // + // which is expressed in the following statements. + + sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum); + sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum); + sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum); + sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum); + } + } + } + + imageStore(image_out, pos, sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml new file mode 100644 index 00000000000..882737b6f19 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl new file mode 100644 index 00000000000..56d70a2bfe0 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; +}; + +// If fields are separated, SwiftShader cannot identify in_group_size. +layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams { + ivec2 overlay_region; + int in_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * Computes a depthwise convolution. Each shader invocation calculates the + * output at a single output location. + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Compute the start and end of the input indices to load. Padding is assumed + // to be constant 0 padding, so reads from the padding region are skipped. + const ivec2 start = ipos; + const ivec2 end = ipos + overlay_region.xy; + + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + int kx = 0; + for (int y = start.y; y < end.y; y += dilation.y) { + for (int x = start.x; x < end.x; x += dilation.x) { + // The weight kernel was rearranged such that every NxN filter is + // flattened to fit in one row. Each filter was then stacked on top of + // each other vertically. + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0); + sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum); + ++kx; + } + } + + imageStore(image_out, pos, sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml new file mode 100644 index 00000000000..31c9778b2aa --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_dw: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_dw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl new file mode 100644 index 00000000000..cf4cfe66ac2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; +}; + +// If fields are separated, SwiftShader cannot identify in_group_size. +layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams { + ivec2 overlay_region; + int in_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * Computes a depthwise convolution. Each shader invocation calculates the + * output at a single output location. + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Compute the start and end of the input indices to load. Padding is assumed + // to be constant 0 padding, so any reads from the padding region is skipped. + const ivec2 start = ipos; + const ivec2 end = ipos + overlay_region.xy; + + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + int kx = 0; + for (int y = start.y, i = 0; i < ${TILE_SIZE}; y += dilation.y, i++) { + for (int x = start.x, j = 0; j < ${TILE_SIZE}; x += dilation.x, j++) { + // The weight kernel was rearranged such that every NxN filter is + // flattened to fit in one row. Each filter was then stacked on top of + // each other vertically. + const vec4 in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0); + sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum); + kx++; + } + } + + imageStore(image_out, pos, sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml new file mode 100644 index 00000000000..b9346abdd9d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_dw_output_tile: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + TILE_SIZE: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_dw_output_tile_3x3 + - NAME: conv2d_dw_output_tile_5x5 + TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl new file mode 100644 index 00000000000..d3ae8b3b32b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -0,0 +1,117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#include "indexing_utils.h" + +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + BUF_T buffer_in[]; +}; + +// Corresponds to {1,4,3,9} in the example below. +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// Corresponds to {3,3,1,11} in the example below. +layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes { + ivec4 original_sizes; +}; + +// Corresponds to {1,12} in the example below. +layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes { + ivec2 padded_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +/* + * Computes special prepacking for a depthwise convolution. Each shader invocation + * calculates the input buffer location to read into the desired texel. This + * packing was originally developed on CPU and that approach is described in the + * rest of this comment. Refer to the code-level comments, for how we translate + * it to GPU by reversing the steps. + * + * Consider an example weight tensor of size {11,1,3,3}. The following + * transformations will be applied. + * + * 1. Pad the N dim so that it is a multiple of 4. In this case, 1 + * batch of padding is added, producing a tensor of size {12,1,3,3}. + * at::pad(x, {0,0,0,0,0,0,0,1}, "constant", 0); + * + * 2. Flatten the last two dims by reshaping the tensor: + * x.reshape({12,1,9}); + * + * 3. "Fold" the N dim into the C dim. Split the tensor along the N dim so that + * each split has 4 channels. + * x.reshape({3,4,1,9}); + * + * 4. Stack the batches on each other vertically by permuting the N and C dims + * and reshaping the tensor. + * x.permute({1,0,2,3}).reshape({4,3,9}); + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); + + if (any(greaterThanEqual(idx, sizes))) { + return; + } + + // As in usual staging shaders, map from GPU texel position to normal CPU + // buffer indices: (9,3) -> (4,3,9) + const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + + // Re-map the normal CPU buffer indices to special indices, through a series + // of mappings: reshape is a no-op to the underlying indices, so we only map + // for pad and permute. + const int Np = padded_sizes.x; + const int N = original_sizes.w; + const int C = original_sizes.z; + const int H = original_sizes.y; + const int W = original_sizes.x; + + // Undo step 3 permute: (4,3,1,9) -> (3,4,1,9) + const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (C * H * W)); + + // Undo step 1 pad: (12,1,3,3) -> (11,1,3,3) + // For values in the padded region, write zero instead of buffer data. 
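The CPU-side packing recipe that this shader inverts is spelled out in the header comment above, and it can be reproduced directly with a few tensor ops. The sketch below follows the comment's {11,1,3,3} example; PyTorch is used purely for illustration, while the runtime performs the equivalent remapping on the GPU by reversing these steps:

```python
import torch
import torch.nn.functional as F

# Depthwise weight with the comment's example shape (N=11, C=1, H=3, W=3).
x = torch.arange(11 * 1 * 3 * 3, dtype=torch.float32).reshape(11, 1, 3, 3)

# 1. Pad N up to a multiple of 4: (11,1,3,3) -> (12,1,3,3).
x = F.pad(x, (0, 0, 0, 0, 0, 0, 0, 1), mode="constant", value=0)

# 2. Flatten the spatial dims: (12,1,3,3) -> (12,1,9).
x = x.reshape(12, 1, 9)

# 3. "Fold" the N dim into the C dim by splitting N into groups of 4: -> (3,4,1,9).
x = x.reshape(3, 4, 1, 9)

# 4. Stack the batches on each other vertically by permuting N and C: -> (4,3,9).
x = x.permute(1, 0, 2, 3).reshape(4, 3, 9)

print(x.shape)  # torch.Size([4, 3, 9])
```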
+ const ivec4 n = p1 / (C * H * W); + const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N))); + + VEC4_T texel = VEC4_T(0); + if (mask.x == 0) { + texel.x = SCALAR_T(buffer_in[p1.x]); + } + if (mask.y == 0) { + texel.y = SCALAR_T(buffer_in[p1.y]); + } + if (mask.z == 0) { + texel.z = SCALAR_T(buffer_in[p1.z]); + } + if (mask.w == 0) { + texel.w = SCALAR_T(buffer_in[p1.w]); + } + + imageStore(image_out, pos.xy, texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml new file mode 100644 index 00000000000..33342145a82 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_dw_prepack_weights: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_dw_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl new file mode 100644 index 00000000000..cb84cb38272 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -0,0 +1,141 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#include "indexing_utils.h" + +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + BUF_T buffer_in[]; +}; + +// Corresponds to {1,4,9,24} in the example below. +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// Corresponds to {3,3,7,10} in the example below. +layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes { + ivec4 original_sizes; +}; + +// Corresponds to {8,12} in the example below. +layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes { + ivec2 padded_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +/* + * Computes special prepacking for a 2D convolution. Each shader invocation + * calculates the input buffer location to read into the desired texel. This + * packing was originally developed on CPU and that approach is described in the + * rest of this comment. Refer to the code-level comments, for how we translate + * it to GPU by reversing the steps. + * + * Consider an example weight tensor of size {10,7,3,3}. The following + * transformations will be applied. + * + * 1. Pad the N and C dims so that both are a multiple of 4. In this case, 2 + * batches and 1 channel of padding are added, producing a tensor of size + * {12,8,3,3}. + * at::pad(x, {0,0,0,0,0,1,0,2}, "constant", 0); + * + * 2. 
Split the tensor along the C dim so that each split has 4 channels. + * x.reshape({12,2,4,3,3}); + * + * 3. For each split, "fold" the C dim into the W dim. Suppose the first rows + * at H=0 of the split have values + * 0,1,2 | 10,11,12 | 20,21,22 | 30,31,32 + * + * where | denotes a channel boundary. Then, the goal is to combine those rows + * into one row with the values + * 0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32 + * + * x.permute({0,1,3,4,2}).reshape({12,2,3,12}); + * + * 4. Stack the splits belonging to the same batch horizontally by swapping the + * C and H dims. + * x.permute({0,2,1,3}).reshape({12,3,24}); + * + * 5. Repeat a similar process to "fold" the N dim into the C dim. Split along + * the N dim so that each split has 4 batches. + * x.reshape({3,4,3,24}); + * + * 6. Stack the batches on each other vertically by swapping the N and C dims. + * x.permute({1,0,2,3}).reshape({4,9,24}); + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); + + if (any(greaterThanEqual(idx, sizes))) { + return; + } + + // As in usual staging shaders, map from GPU texel position to normal CPU + // buffer indices: (24,9) -> (4,9,24) + const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + + // Re-map the normal CPU buffer indices to special indices, through a series + // of mappings: reshape is a no-op to the underlying indices, so we only map + // for pad and permute. + const int Np = padded_sizes.y; + const int Cp = padded_sizes.x; + const int N = original_sizes.w; + const int C = original_sizes.z; + const int H = original_sizes.y; + const int W = original_sizes.x; + + // Undo step 6 premute: (4,3,3,24) -> (3,4,3,24) + // Undo step 4 permute: (12,3,2,12) -> (12,2,3,12) + // Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w) + // Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w) + const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (H * Cp * W)); + const ivec4 p2 = swap_adj_dims(p1, H, (Cp / 4), (W * 4)); + const ivec4 p3 = swap_adj_dims(p2, W, 4, 1); + const ivec4 p4 = swap_adj_dims(p3, H, 4, W); + + // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3) + // For values in the padded region, write zero instead of buffer data. + const ivec4 c = p4 % (Cp * H * W) / (H * W); + const ivec4 n = p4 / (Cp * H * W); + const ivec4 p5 = p4 - n * (Cp - C) * H * W; + const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) | + ivec4(greaterThanEqual(n, ivec4(N))); + + VEC4_T texel = VEC4_T(0); + if (mask.x == 0) { + texel.x = SCALAR_T(buffer_in[p5.x]); + } + if (mask.y == 0) { + texel.y = SCALAR_T(buffer_in[p5.y]); + } + if (mask.z == 0) { + texel.z = SCALAR_T(buffer_in[p5.z]); + } + if (mask.w == 0) { + texel.w = SCALAR_T(buffer_in[p5.w]); + } + + imageStore(image_out, pos.xy, texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml new file mode 100644 index 00000000000..28cf63dc163 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_prepack_weights: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl new file mode 100644 index 00000000000..453a03dea54 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -0,0 +1,151 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 data; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; +}; + +// If fields are separated, SwiftShader cannot identify in_group_size. +layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams { + ivec2 overlay_region; + int in_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * Computes a 2D pointwise convolution of an NxN output tile. Calculating an + * output tile for pointwise convolution is more efficient because the kernel + * size is only 1x1, making it easier to re-use loaded texels from kernel_in. + */ +void main() { + const ivec3 gpos = ivec3(gl_GlobalInvocationID); + + // Output position for TILE_SIZE = 2 + // +--------+--------+ + // | pos[0] | pos[1] | + // +--------+--------+ + // | pos[2] | pos[3] | + // +--------+--------+ + ivec3 pos[${TILE_SIZE * TILE_SIZE}]; + for (int y = 0, i = 0; y < 2; ++y) { + for (int x = 0; x < 2; ++x) { + pos[i] = ivec3( + gpos.x * 2 + x, gpos.y * ${TILE_SIZE} + y, gpos.z); + i++; + } + } + + // If the top left position is out of bounds, then this invocation will have + // no work to do. + if (any(greaterThanEqual(pos[0], out_limits))) { + return; + } + + // Compute the index of the input texture that needs to be loaded for each + // output position. Note that negative indices can be produced indicating that + // the top-left element is in a region added by padding. + ivec2 ipos[${TILE_SIZE * TILE_SIZE}]; + for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) { + ipos[i] = pos[i].xy * stride - padding; + } + + vec4 sum[${TILE_SIZE * TILE_SIZE}]; + sum[0] = texelFetch(bias_in, ivec2(gpos.z, 0), 0); + for (int i = 1; i < ${TILE_SIZE * TILE_SIZE}; ++i) { + sum[i] = sum[0]; + } + + // Since the kernel is 1x1, we only have to loop over the depth dimension. + for (int z = 0, z4 = 0; z < in_group_size; z += 4, ++z4) { + // During prepacking, the weight tensor has been permuted so that the + // channel (IC) dim is along the x-axis, and the batch (OC) dim is along + // the z-axis. 
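The accumulation that follows is easiest to read as a small matrix-vector product: each weight texel fetched at x = z + k holds the weights linking input channel z + k to the four output channels handled by this invocation, and the four fma lines broadcast one input channel across them. A NumPy sketch of a single step for one output texel, with purely illustrative shapes and values:

```python
import numpy as np

# One group of 4 input channels at a single spatial position: shape (4,).
in_tex = np.array([1.0, 2.0, 3.0, 4.0])

# Four weight texels fetched at x = z+0..z+3; each holds the weights of the
# 4 output channels of this invocation for one input channel: shape (4, 4).
ktex = np.arange(16, dtype=np.float64).reshape(4, 4)

sum_ = np.zeros(4)  # running output texel (4 output channels); bias omitted here
for k in range(4):
    # sum = fma(in_tex[k].xxxx, ktex_k, sum): broadcast one input channel
    # across the 4 output channels held in the k-th weight texel.
    sum_ += in_tex[k] * ktex[k]

# Equivalent formulation: a (4 out) x (4 in) weight block times the input vector.
assert np.allclose(sum_, ktex.T @ in_tex)
print(sum_)
```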
+ vec4 in_tex[${TILE_SIZE * TILE_SIZE}]; + const vec4 ktex_0 = texelFetch(kernel_in, ivec2(z + 0, gpos.z), 0); + const vec4 ktex_1 = texelFetch(kernel_in, ivec2(z + 1, gpos.z), 0); + const vec4 ktex_2 = texelFetch(kernel_in, ivec2(z + 2, gpos.z), 0); + const vec4 ktex_3 = texelFetch(kernel_in, ivec2(z + 3, gpos.z), 0); + + for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) { + in_tex[i] = texelFetch(image_in, ivec3(ipos[i], z4), 0); + } + + for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) { + // For 2x2 tile size algorithm works as follows. + // To explain the calculations below, the contents of one in_tex and the + // group of 4 texels loaded from kernel_in are shown: + // + // in_tex kernel_in + // -x-> ---x---> + // +---+ +----+----+----+----+ + // ^ | w | ^ | D0 | D1 | D2 | D3 | + // | +---+ | +----+----+----+----+ + // | | z | | | C0 | C1 | C2 | C3 | + // z +---+ z +----+----+----+----+ + // | | y | | | B0 | B2 | B2 | B3 | + // | +---+ | +----+----+----+----+ + // | x | | A0 | A1 | A2 | A3 | + // +---+ +----+----+----+----+ + // + // In the kernel_in graphic, cells sharing the same letter are from + // the same batch/output channel index, and the number denotes a unique + // channel index. To calculate the output texel, the following + // calculation is performed: + // + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | + // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ + // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // + // which is what is expressed in the following calculations. This is done + // for each output position. + sum[i] = fma(in_tex[i].xxxx, ktex_0, sum[i]); + sum[i] = fma(in_tex[i].yyyy, ktex_1, sum[i]); + sum[i] = fma(in_tex[i].zzzz, ktex_2, sum[i]); + sum[i] = fma(in_tex[i].wwww, ktex_3, sum[i]); + } + } + + for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) { + if (all(lessThan(pos[i], out_limits))) { + imageStore(image_out, pos[i], sum[i]); + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml new file mode 100644 index 00000000000..2e04b6a3991 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_pw: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + TILE_SIZE: 2 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_pw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl new file mode 100644 index 00000000000..3f2f6241a1d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in; +layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in; + +layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 6) uniform PRECISION restrict Params { + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; +}; + +// If fields are separated, SwiftShader cannot identify in_group_size. +layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams { + ivec2 overlay_region; + int in_group_size; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +/* + * Computes a 2D transpose convolution. Each shader invocation calculates the + * output at a single output location. For details, refer to conv2d.glsl which + * uses a similar approach. + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + ivec2 ipos = pos.xy + padding; + + const ivec2 start = max( + ivec2(0), + ivec2(ceil((vec2(ipos) - kernel_size + 1) / vec2(stride)))); + const ivec2 end = + min(ivec2(in_sizes.xy), + ivec2(floor(vec2(ipos) / vec2(stride))) + 1); + + const int ic = in_group_size; + const int kx_stride = ic * (stride.x - 1); + + int ky_start = overlay_region.y - 1 - (ipos.y - stride.y * start.y) + pos.z * kernel_size.y; + int kx_start = (overlay_region.x - 1 - (ipos.x - stride.x * start.x)) * ic; + + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += stride.y) { + for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) { + for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) { + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); + const ivec4 kxs = kx + ivec4(0, 1, 2, 3); + + sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum); + sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum); + sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum); + sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum); + } + } + } + + imageStore(image_out, pos, sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml new file mode 100644 index 00000000000..7fc40c3242e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv_transpose2d: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv_transpose2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl new file mode 100644 index 00000000000..7c3dab547ed --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -0,0 +1,123 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#include "indexing_utils.h" + +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + BUF_T buffer_in[]; +}; + +// Corresponds to {1,4,6,36} in the example below. +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// Corresponds to {3,3,7,10} in the example below. +layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes { + ivec4 original_sizes; +}; + +// Corresponds to {8,12} in the example below. +layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes { + ivec2 padded_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +/* + * Computes special prepacking for a 2D transpose convolution. Each shader + * invocation calculates the input buffer location to read into the desired + * texel. + * + * For details, refer to conv2d_prepack_weights.glsl which uses a similar + * approach. For transpose, there are slight differences to reflect the data + * access pattern in the shader. First, the weight tensor is flipped along the H + * and W dims. Second, steps 3 and 4 are slightly different so that the splits + * are interleaved. + */ +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); + + if (any(greaterThanEqual(idx, sizes))) { + return; + } + + // As in usual staging shaders, map from GPU texel position to normal CPU + // buffer indices: (36,6) -> (4,6,36) + const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + + // Re-map the normal CPU buffer indices to special indices, through a series + // of mappings: reshape is a no-op to the underlying indices, so we only map + // for flip, pad, and permute. + const int Np = padded_sizes.y; + const int Cp = padded_sizes.x; + const int N = original_sizes.w; + const int C = original_sizes.z; + const int H = original_sizes.y; + const int W = original_sizes.x; + + // Undo step 6 premute: (4,2,3,36) -> (2,4,3,36) + // In the following comments, a=b=c=3. 
+ // Undo step 3 permute, part 1: (8,a,b,c,4) -> (8,a,c,b,4) + // Undo step 3 permute, part 2: (8,a,c,b,4) -> (8,c,a,b,4) + // Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b) + // Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b) + const ivec4 p1 = swap_adj_dims(p0, 4, (Cp / 4), (H * Np * W)); + const ivec4 p2 = swap_adj_dims(p1, W, (Np / 4), 4); + const ivec4 p3 = swap_adj_dims(p2, H, (Np / 4), (W * 4)); + const ivec4 p4 = swap_adj_dims(p3, W, 4, 1); + const ivec4 p5 = swap_adj_dims(p4, H, 4, W); + + // Undo step 0 permute: (8,12,3,3) -> (12,8,3,3) + const ivec4 p6 = swap_adj_dims(p5, Cp, Np, (W * H)); + // Undo step 0 flip: (2,3) + const ivec4 w = p6 % W; + const ivec4 h = p6 % (H * W) / W; + const ivec4 p7 = p6 + W - 1 - 2 * w + W * (H - 1 - 2 * h); + + // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3) + // For values in the padded region, write zero instead of buffer data. + const ivec4 c = p7 % (Cp * H * W) / (H * W); + const ivec4 n = p7 / (Cp * H * W); + const ivec4 p8 = p7 - n * (Cp - C) * H * W; + const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) | + ivec4(greaterThanEqual(n, ivec4(N))); + + VEC4_T texel = VEC4_T(0); + if (mask.x == 0) { + texel.x = SCALAR_T(buffer_in[p8.x]); + } + if (mask.y == 0) { + texel.y = SCALAR_T(buffer_in[p8.y]); + } + if (mask.z == 0) { + texel.z = SCALAR_T(buffer_in[p8.z]); + } + if (mask.w == 0) { + texel.w = SCALAR_T(buffer_in[p8.w]); + } + + imageStore(image_out, pos.xy, texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml new file mode 100644 index 00000000000..d933cd097aa --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv_transpose2d_prepack_weights: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv_transpose2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl new file mode 100644 index 00000000000..17b3e06e61e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict InLimits { + ivec3 in_limits; +}; + + + +layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs { + ivec3 range; + int unused0; + ivec3 src_offset; + int unused1; + ivec3 dst_offset; + int unused2; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + const ivec3 out_pos = pos + dst_offset; + const ivec3 in_pos = pos + src_offset; + + if (any(greaterThanEqual(pos, range))) { + return; + } + + imageStore(image_out, out_pos, texelFetch(image_in, in_pos, 0)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml new file mode 100644 index 00000000000..4a31ba6bbca --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml @@ -0,0 +1,10 @@ +copy_offset: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.glsl b/backends/vulkan/runtime/graph/ops/glsl/full.glsl new file mode 100644 index 00000000000..4dd223414e4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/full.glsl @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#include "broadcasting_utils.h" +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; + +layout(set = 0, binding = 1) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 2) uniform PRECISION restrict FillVal { + float fill_value; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); + + if (any(greaterThanEqual(idx, sizes))) { + return; + } + + VEC4_T outtex = VEC4_T(fill_value); + const int packed_dim_size = sizes[packed_dim]; + int packed_idx = idx[packed_dim]; + + if (packed_idx + 3 >= packed_dim_size) { + ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); + VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); + outtex = outtex * valid_idx; + } + + imageStore(image_out, ${get_pos[NDIM]("pos")}, outtex); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml new file mode 100644 index 00000000000..a997b73de61 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +full: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: full diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index c353908c416..6c3ff2bb9fb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -10,55 +10,54 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + layout(std430) buffer; layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { - ${T[DTYPE]} data[]; -} -buffer_out; + BUF_T buffer_out[]; +}; -layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { - ivec4 data; -} -gpu_sizes; - -layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { - ivec4 data; -} -cpu_sizes; +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, sizes))) { return; } - const ${VEC4_T[DTYPE]} intex = texelFetch(image_in, ${GET_POS[NDIM]("pos")}, 0); + const VEC4_T intex = texelFetch(image_in, ${get_pos[NDIM]("pos")}, 0); - const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); - const ivec4 buf_indices = - base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data); + const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); - const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data); - int packed_coord = PACKED_DIM_${PACKING}(coord); + const int packed_dim_size = sizes[packed_dim]; + int packed_idx = idx[packed_dim]; - if (packed_coord < packed_dim_size) { - buffer_out.data[buf_indices.x] = intex.x; + if (packed_idx < packed_dim_size) { + buffer_out[buf_indices.x] = BUF_T(intex.x); } - if (packed_coord + 1 < packed_dim_size) { - buffer_out.data[buf_indices.y] = intex.y; + if (packed_idx + 1 < packed_dim_size) { + buffer_out[buf_indices.y] = BUF_T(intex.y); } - if (packed_coord + 2 < packed_dim_size) { - buffer_out.data[buf_indices.z] = intex.z; + if (packed_idx + 2 < packed_dim_size) { + buffer_out[buf_indices.z] = BUF_T(intex.z); } - if (packed_coord + 3 < packed_dim_size) { - buffer_out.data[buf_indices.w] = intex.w; + if (packed_idx + 3 < packed_dim_size) { + buffer_out[buf_indices.w] = BUF_T(intex.w); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 4683f51ac60..6885e0f3e2e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -8,23 +8,15 @@ image_to_nchw: parameter_names_with_default_values: NDIM: 3 DTYPE: float - PACKING: 
CHANNELS_PACKED generate_variant_forall: - PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed + NDIM: + - VALUE: 3 + SUFFIX: 3d + - VALUE: 2 + SUFFIX: 2d DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - - NAME: image3d_to_nchw - - NAME: image2d_to_nchw - NDIM: 2 + - NAME: image_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index c76f054ec67..415bbedfe77 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -6,41 +6,166 @@ * LICENSE file in the root directory of this source tree. */ -#define DIVUP4(x) ((x + 3) / 4) +// Width Dim Index, assuming (W, H, C, N) order +#define W_DIM 0 +// Height, assuming (W, H, C, N) order +#define H_DIM 1 +// Channels, assuming (W, H, C, N) order +#define C_DIM 2 -#define PACKED_DIM_CHANNELS_PACKED(vec) vec.z +/* + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. + * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. + */ +#define BATCH_AXIS 2 + +// +// Basic Indexing Utility Macros and Functions +// + +/* + * Divides input and rounds up to 4 + */ +#define divup4(x) ((x + 3) / 4) + +/* + * Aligns input to the next multiple of 4 + */ +#define alignup4(x) ((x + 3) & -4) + +// +// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion +// + +/* + * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim + * is packed along a texel + * Output: A ivec4 containing the buffer indices corresponding to each texel + * element. + */ +ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { + ivec4 strides = + ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); -#define PACKED_DIM_WIDTH_PACKED(vec) vec.x + int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + + idx.w * strides.w; -#define PACKED_DIM_HEIGHT_PACKED(vec) vec.y + return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; +} -#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \ - ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z) +/* + * Input: Index into a tensor's data buffer, (W, H, C, N) sizes of a tensor + * Returns: The WCHN index of the tensor that corresponds to the specified + * buffer index, assuming the buffer has contiguous memory layout + */ +ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { + return ivec4( + buf_i % sizes.x, + (buf_i / (sizes.x)) % sizes.y, + (buf_i / (sizes.x * sizes.y)) % sizes.z, + (buf_i / (sizes.x * sizes.y * sizes.z))); +} -#define POS_TO_COORD_WIDTH_PACKED(pos, sizes) \ - ivec4((pos.x * 4), pos.y, pos.z % sizes.z, pos.z / sizes.z) +// +// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion +// -#define POS_TO_COORD_HEIGHT_PACKED(pos, sizes) \ - ivec4(pos.x, (pos.y * 4), pos.z % sizes.z, pos.z / sizes.z) +/* + * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim + * is packed along a texel + * Output: Whether the texel position is outside the bounds of the image texture + * given the size and packed dimension of the tensor. 
+ */ +bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); -#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4) + ivec3 max_pos = sizes.xyz; + max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; + max_pos[packed_dim] /= 4; + return (any(greaterThanEqual(pos, max_pos))); +} -#define COORD_TO_POS_WIDTH_PACKED(coord, sizes) \ - ivec3(coord.x / 4, coord.y, (coord.z + coord.w * sizes.z)) +/* + * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, + * which dim is packed along a texel + * Returns: the (w, h, c, n) tensor index cooresponding to the first element of + * the texel at the specified position + */ +ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); -#define COORD_TO_POS_HEIGHT_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y / 4, (coord.z + coord.w * sizes.z)) + // Packed dim contains 4 elements per texel + pos[packed_dim] *= 4; + // Construct the initial tensor index via swizzling +#if BATCH_AXIS == 2 + ivec4 tensor_idx = pos.xyzz; +#endif +#if BATCH_AXIS == 1 + ivec4 tensor_idx = pos.xyzy; +#endif +#if BATCH_AXIS == 0 + ivec4 tensor_idx = pos.xyzx; +#endif + // Adjust the axis that the batch dim runs along + tensor_idx[3] /= sizes[BATCH_AXIS]; + tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; -#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4) + return tensor_idx; +} -#define COORD_TO_BUFFER_IDX(coord, sizes) \ - coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \ - coord.w* sizes.z* sizes.y* sizes.x; +/* + * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim + * is packed along a texel + * Returns: the (x, y, z) texture position containing element of the tensor at + * the specified index + */ +ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos = idx.xyz; + pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; + pos[packed_dim] /= 4; + return pos; +} + +/* + * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim + * is packed along a texel + * Returns: the (x, y, z, i) texture position containing the element of the + * tensor at the specified index, where i is the component within the + * texel to which the element belongs + */ +ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); -#define STRIDE_CHANNELS_PACKED(vec) (vec.x * vec.y) + // pos[4] is set to a placeholder value + ivec4 pos = idx.xyzx; + pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; + pos[packed_dim] /= 4; + pos.w = idx[packed_dim] % 4; + return pos; +} -#define STRIDE_WIDTH_PACKED(vec) (1) +// +// Miscellaneous Utility Functions and Macros +// -#define STRIDE_HEIGHT_PACKED(vec) (vec.x) +// Given a buffer(1-D) index cur, compute a new index where the corresponding +// tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane +// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a +// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. 
Then, x=4, y=3 and +// plane=2*24=48. +#define swap_adj_dims(cur, x, y, plane) \ + cur + \ + plane * \ + ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \ + (x - 1) * ((cur % (y * plane)) / plane)) diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl index fe1087f637a..a911c4fb6e4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl @@ -8,30 +8,28 @@ #version 450 core -#include "indexing_utils.h" - #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; +layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { - ivec4 data; -} -in_sizes; + ivec4 in_sizes; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_extents.data.xyz))) { + if (any(greaterThanEqual(pos, out_limits))) { return; } @@ -39,15 +37,15 @@ void main() { ivec3 mat1_pos = ivec3(0, pos.y, pos.z); - $if MAT2_PACKING == "HEIGHT_PACKED": + $if MAT2_PACKING == "H_packed": ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); $else: ivec3 mat2_pos = ivec3(pos.x, 0, pos.z); - $if MAT1_PACKING == "WIDTH_PACKED": - int K = DIVUP4(in_sizes.data[0]); + $if MAT1_PACKING == "W_packed": + int K = divup4(in_sizes[0]); for (int i = 0; i < K; ++i) { - $if MAT2_PACKING == "HEIGHT_PACKED": + $if MAT2_PACKING == "H_packed": vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); vec4 sums = vec4( dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), @@ -59,7 +57,7 @@ void main() { mat1_pos.x++; mat2_pos.y++; - $elif MAT2_PACKING == "WIDTH_PACKED": + $elif MAT2_PACKING == "W_packed": vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); mat2_pos.y++; @@ -74,8 +72,8 @@ void main() { $else: $raise Exception("Unsupported value for MAT2_PACKING") } - $elif MAT1_PACKING == "CHANNELS_PACKED" and MAT2_PACKING == "CHANNELS_PACKED": - int K = in_sizes.data[0]; + $elif MAT1_PACKING == "C_packed" and MAT2_PACKING == "C_packed": + int K = in_sizes[0]; for (int i = 0; i < K; ++i) { texel = fma( texelFetch(im_mat1, mat1_pos, 0), diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml index e1699eb1ee8..ef54dbc722a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml @@ -8,18 +8,16 @@ matmul: parameter_names_with_default_values: DTYPE: float NDIM: 3 - MAT1_PACKING: WIDTH_PACKED - MAT2_PACKING: HEIGHT_PACKED + MAT1_PACKING: W_packed + MAT2_PACKING: H_packed generate_variant_forall: DTYPE: - VALUE: float - SUFFIX: float - VALUE: half - SUFFIX: half shader_variants: - NAME: matmul_W_packed_H_packed - NAME: matmul_W_packed_W_packed - MAT2_PACKING: WIDTH_PACKED + MAT2_PACKING: W_packed - NAME: matmul_C_packed_C_packed - MAT1_PACKING: CHANNELS_PACKED - MAT2_PACKING: CHANNELS_PACKED + MAT1_PACKING: C_packed + MAT2_PACKING: C_packed diff --git 
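For intuition about the rewritten indexing helpers in indexing_utils.h above (divup4, alignup4, get_texel_nchw_buffer_ixs, to_tensor_idx and the swap_adj_dims macro), the following host-side C++ sketch mirrors their arithmetic. It is not part of the patch; the tensor sizes and element choices are made-up examples, and only the formulas come from the header. These are the same WHCN-sizes, channels-packed, batches-along-depth conventions that the matmul and staging shaders in this diff rely on.

#include <array>
#include <cassert>

using ivec3 = std::array<int, 3>;
using ivec4 = std::array<int, 4>;  // (w, h, c, n)

int divup4(int x) { return (x + 3) / 4; }
int alignup4(int x) { return (x + 3) & -4; }

// NCHW buffer indices of the 4 elements in the texel whose first element is `idx`.
ivec4 texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) {
  const ivec4 strides = {1, sizes[0], sizes[0] * sizes[1], sizes[0] * sizes[1] * sizes[2]};
  const int base = idx[0] * strides[0] + idx[1] * strides[1] +
                   idx[2] * strides[2] + idx[3] * strides[3];
  return {base, base + strides[packed_dim],
          base + 2 * strides[packed_dim], base + 3 * strides[packed_dim]};
}

// (x, y, z) texel position -> (w, h, c, n) index of its first element, assuming
// batches run along the texture depth (BATCH_AXIS == 2), as in the header.
ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
  sizes[packed_dim] = alignup4(sizes[packed_dim]);  // account for texel padding
  pos[packed_dim] *= 4;                             // 4 elements per texel
  ivec4 idx = {pos[0], pos[1], pos[2], pos[2]};
  idx[3] /= sizes[2];  // which batch this depth slice belongs to
  idx[2] %= sizes[2];  // channel within that batch
  return idx;
}

// Mirror of the swap_adj_dims macro.
int swap_adj_dims(int cur, int x, int y, int plane) {
  return cur + plane * ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) +
                        (x - 1) * ((cur % (y * plane)) / plane));
}

int main() {
  // Example tensor: W=5, H=3, C=6, N=2, channels packed (packed_dim == 2).
  const ivec4 sizes = {5, 3, 6, 2};
  assert(sizes[3] * divup4(sizes[2]) == 4);  // texture depth: 2 batches * 2 channel texels

  // Texel at texture position (1, 2, 3): last depth slice -> batch 1, channels 4..7.
  const ivec4 idx = to_tensor_idx({1, 2, 3}, sizes, 2);
  assert((idx == ivec4{1, 2, 4, 1}));

  // Buffer indices of its elements (channel stride = W*H = 15, batch stride = 90).
  const ivec4 bufs = texel_nchw_buffer_ixs(idx, sizes, 2);
  assert((bufs == ivec4{161, 176, 191, 206}));
  // 191 and 206 point past the 180-element buffer: those lanes are texel padding,
  // which is why the staging shaders guard each lane with packed_idx + k < packed_dim_size.

  // swap_adj_dims, using the example from the header: {4,3,2,24} -> {3,4,2,24},
  // so x = 4, y = 3, plane = 2 * 24 = 48. Take element (a=2, b=1), offset 5 in the plane.
  const int cur = 2 * (3 * 48) + 1 * 48 + 5;       // flat index under {4,3,2,24}
  const int expected = 1 * (4 * 48) + 2 * 48 + 5;  // flat index under {3,4,2,24}
  assert(swap_adj_dims(cur, 4, 3, 48) == expected);
  return 0;
}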
a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl index 5ec8af29e70..25749afbf85 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl @@ -19,48 +19,45 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx; layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; +layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; -layout(set = 0, binding = 4) uniform PRECISION restrict InExtents { - uvec4 data; -} -in_extents; +layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; layout(set = 0, binding = 5) uniform PRECISION restrict Params { ivec2 kernel_size; ivec2 stride; ivec2 padding; ivec2 dilation; -} -params; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_extents.data.xyz))) { + if (any(greaterThanEqual(pos, out_limits))) { return; } - const ivec2 ipos = pos.xy * params.stride - params.padding; + const ivec2 ipos = pos.xy * stride - padding; const ivec2 start = ipos; - const ivec2 end = ipos + params.kernel_size * params.dilation; + const ivec2 end = ipos + kernel_size * dilation; vec4 out_texel = vec4(FLT_MIN); ivec4 idx_texel = ivec4(0); - for (int y = start.y; y < end.y; y += params.dilation.y) { - for (int x = start.x; x < end.x; x += params.dilation.x) { - if ((x >= 0 && x < in_extents.data.x) && (y >= 0 && y < in_extents.data.y)) { + for (int y = start.y; y < end.y; y += dilation.y) { + for (int x = start.x; x < end.x; x += dilation.x) { + if ((x >= 0 && x < in_sizes.x) && (y >= 0 && y < in_sizes.y)) { const vec4 cur_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0); // Set idx if value is greatest in the pool; else, keep the existing idx. - ivec4 cur_idx = ivec4(x + int(in_extents.data.x) * y); + ivec4 cur_idx = ivec4(x + int(in_sizes.x) * y); ivec4 mask = ivec4(greaterThan(cur_texel, out_texel)); idx_texel = ivec4(mix(idx_texel, cur_idx, mask)); diff --git a/backends/vulkan/runtime/graph/ops/glsl/pool.yaml b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml similarity index 90% rename from backends/vulkan/runtime/graph/ops/glsl/pool.yaml rename to backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml index 8228ea862e7..3be032bf85d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pool.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml @@ -11,8 +11,6 @@ max_pool2d: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: max_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl new file mode 100644 index 00000000000..235408c0a81 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
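To make the window arithmetic in the max_pool2d shader above concrete, here is a scalar C++ sketch of a single output element. It is not part of the patch; the kernel, stride, padding, dilation and input values are invented, and only the index math (ipos = pos * stride - padding, flat argmax index x + W * y) mirrors the shader.

#include <cassert>
#include <cfloat>
#include <cstdio>
#include <vector>

int main() {
  const int W = 6, H = 5;                                   // input plane size (example)
  const int kernel = 3, stride = 2, padding = 1, dilation = 1;
  std::vector<float> input(W * H);
  for (int i = 0; i < W * H; ++i) input[i] = float(i % 7);  // arbitrary test data

  const int out_x = 1, out_y = 2;                           // one output position

  // ipos = out_pos * stride - padding; window end = ipos + kernel * dilation.
  const int start_x = out_x * stride - padding;
  const int start_y = out_y * stride - padding;
  const int end_x = start_x + kernel * dilation;
  const int end_y = start_y + kernel * dilation;

  float best = -FLT_MAX;  // running maximum
  int best_idx = 0;       // flat index x + W * y, as stored in image_idx
  for (int y = start_y; y < end_y; y += dilation) {
    for (int x = start_x; x < end_x; x += dilation) {
      if (x < 0 || x >= W || y < 0 || y >= H) continue;  // out-of-bounds taps are skipped
      const float v = input[x + W * y];
      if (v > best) {
        best = v;
        best_idx = x + W * y;
      }
    }
  }
  assert(best == 6.f && best_idx == 20);  // max over the in-bounds taps of the 3x3 window
  std::printf("max = %g at flat index %d\n", best, best_idx);
  return 0;
}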
+ */ + +#version 450 core + +#include "broadcasting_utils.h" +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_mean; +layout(set = 0, binding = 2, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_rstd; + +layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in; +layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in; + +layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 7) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon { + float epsilon; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int width = int(sizes.x); + + VEC4_T mean = VEC4_T(0); + VEC4_T delta = VEC4_T(0); + VEC4_T delta2 = VEC4_T(0); + VEC4_T M2 = VEC4_T(0); + + // Use Welford's online algorithm to compute mean and variance in one pass + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + for (int w = 0; w < width; ++w) { + VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0); + delta = v - mean; + mean += delta / (w + 1); + delta2 = v - mean; + M2 += delta * delta2; + } + + VEC4_T var = M2 / width; + VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); + VEC4_T offset = -rstd * mean; + + for (int w = 0; w < width; ++w) { + VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0); + // broadcasting + VEC4_T weight = texelFetch(weight_in, ivec3(w, 0, 0), 0).xxxx; + VEC4_T bias = texelFetch(bias_in, ivec3(w, 0, 0), 0).xxxx; + VEC4_T outtex = (v * rstd + offset) * weight + bias; + imageStore(image_out, ivec3(w, pos.y, pos.z), outtex); + } + + imageStore(image_mean, pos, mean); + imageStore(image_rstd, pos, rstd); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml new file mode 100644 index 00000000000..44e9b627ada --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
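The native_layer_norm shader above computes mean and variance in a single pass over the width dimension with Welford's online algorithm, then normalizes as (v * rstd + offset) * weight + bias with offset = -rstd * mean, i.e. (v - mean) * rstd scaled and shifted. A minimal scalar C++ sketch (not part of the patch, with made-up data) showing that the recurrence matches the naive two-pass result:

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> xs = {1.0, 4.0, 2.0, 8.0, 5.0, 7.0};  // one "row" of values

  // Welford's online update, as in the shader's first loop.
  double mean = 0.0, M2 = 0.0;
  for (int w = 0; w < (int)xs.size(); ++w) {
    const double delta = xs[w] - mean;
    mean += delta / double(w + 1);
    const double delta2 = xs[w] - mean;
    M2 += delta * delta2;
  }
  const double var = M2 / double(xs.size());  // population variance, like the shader
  const double eps = 1e-5;
  const double rstd = 1.0 / std::sqrt(var + eps);
  const double offset = -rstd * mean;

  // Naive two-pass reference.
  double sum = 0.0;
  for (double x : xs) sum += x;
  const double ref_mean = sum / double(xs.size());
  double sq = 0.0;
  for (double x : xs) sq += (x - ref_mean) * (x - ref_mean);
  const double ref_var = sq / double(xs.size());

  assert(std::fabs(mean - ref_mean) < 1e-9);
  assert(std::fabs(var - ref_var) < 1e-9);
  std::printf("mean=%g var=%g rstd=%g offset=%g\n", mean, var, rstd, offset);
  return 0;
}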
+ +native_layer_norm: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: native_layer_norm diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index 143d3786c05..07a22c8f96f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -10,55 +10,56 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + #include "indexing_utils.h" +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - ${T[DTYPE]} data[]; -} -buffer_in; + BUF_T buffer_in[]; +}; -layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { - ivec4 data; -} -gpu_sizes; - -layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { - ivec4 data; -} -cpu_sizes; +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, sizes))) { return; } - const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); - const ivec4 buf_indices = - base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data); - - ${T[DTYPE]} val_x = buffer_in.data[buf_indices.x]; - ${T[DTYPE]} val_y = buffer_in.data[buf_indices.y]; - ${T[DTYPE]} val_z = buffer_in.data[buf_indices.z]; - ${T[DTYPE]} val_w = buffer_in.data[buf_indices.w]; - - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); - const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data); - int packed_coord = PACKED_DIM_${PACKING}(coord); + const int packed_dim_size = sizes[packed_dim]; + int packed_idx = idx[packed_dim]; - if (packed_coord + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_coord) + ivec4(0, 1, 2, 3); - ${VEC4_T[DTYPE]} valid_idx = ${VEC4_T[DTYPE]}(lessThan(packed_ind, ivec4(packed_dim_size))); - texel = texel * valid_idx; + VEC4_T texel = VEC4_T(0); + if (packed_idx < packed_dim_size) { + texel.x = SCALAR_T(buffer_in[buf_indices.x]); + } + if (packed_idx + 1 < packed_dim_size) { + texel.y = SCALAR_T(buffer_in[buf_indices.y]); + } + if (packed_idx + 2 < packed_dim_size) { + texel.z = SCALAR_T(buffer_in[buf_indices.z]); + } + if (packed_idx + 3 < packed_dim_size) { + texel.w = SCALAR_T(buffer_in[buf_indices.w]); } - imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); + imageStore(image_out, ${get_pos[NDIM]("pos")}, texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index ad74d663d6d..1fe02c85fd7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ 
b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -8,23 +8,15 @@ nchw_to_image: parameter_names_with_default_values: NDIM: 3 DTYPE: float - PACKING: CHANNELS_PACKED generate_variant_forall: - PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed + NDIM: + - VALUE: 3 + SUFFIX: 3d + - VALUE: 2 + SUFFIX: 2d DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - - NAME: nchw_to_image3d - - NAME: nchw_to_image2d - NDIM: 2 + - NAME: nchw_to_image diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl new file mode 100644 index 00000000000..7466b530a8c --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl @@ -0,0 +1,22 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#include "broadcasting_utils.h" +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() {} diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml new file mode 100644 index 00000000000..f4b77f7b77f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +no_op: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + generate_variant_forall: + NDIM: + - VALUE: 3 + SUFFIX: 3d + - VALUE: 2 + SUFFIX: 2d + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + shader_variants: + - NAME: no_op diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl new file mode 100644 index 00000000000..ff5ab63a4f7 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
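The nchw_to_image / image_to_nchw staging shaders above guard every texel lane because the packed dimension is padded up to a multiple of 4 in the texture but not in the CPU buffer. A small C++ walk-through, not part of the patch, for an assumed W=2, H=2, C=3, N=1 tensor packed along channels:

#include <cstdio>
#include <vector>

int main() {
  const int W = 2, H = 2, C = 3;
  std::vector<float> nchw(W * H * C);              // 12 elements on the CPU side
  for (int i = 0; i < W * H * C; ++i) nchw[i] = float(i);

  // The texel at texture position (x=1, y=1, z=0) holds channels 0..3 of this tensor.
  const int x = 1, y = 1;
  const int base = x + y * W;                      // buffer index of (w=1, h=1, c=0, n=0)
  const int stride_c = W * H;                      // NCHW channel stride
  const int buf[4] = {base, base + stride_c, base + 2 * stride_c, base + 3 * stride_c};

  float texel[4] = {0.f, 0.f, 0.f, 0.f};
  for (int k = 0; k < 4; ++k) {
    if (k < C) {                                   // same guard as packed_idx + k < packed_dim_size
      texel[k] = nchw[buf[k]];
    }
    // k == 3 would read nchw[15] out of a 12-element buffer, so that lane stays 0.
  }
  std::printf("texel = (%g, %g, %g, %g)\n", texel[0], texel[1], texel[2], texel[3]);
  // image_to_nchw applies the same guard on the write side, so padded lanes are never
  // stored back to the buffer.
  return 0;
}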
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 4) uniform PRECISION restrict Block { + // output dims + uvec4 out_ndims; + // x = output channels aligned to 4, y = input channels aligned to 4 + uvec2 ch_info; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int out_channel_4up = int(ch_info.x); + const int in_channel_4up = int(ch_info.y); + const int out_batch = int(sizes[3]); + const int max_dst_index = out_batch * out_channel_4up; + VEC4_T outval = VEC4_T(0.0); + + for (int j = 0; j < 4; ++j) { + int dst_index = pos.z * 4 + j; + if (dst_index >= max_dst_index) { + // out of range + break; + } + + ivec4 v = ivec4(0); // holds b,c,h,w + v[out_ndims[0]] = dst_index / out_channel_4up; + v[out_ndims[1]] = dst_index % out_channel_4up; + v[out_ndims[2]] = pos.y; + v[out_ndims[3]] = pos.x; + + int src_index = v[0] * in_channel_4up + v[1]; + int w = v[3]; + int h = v[2]; + + VEC4_T inval = VEC4_T(texelFetch(image_in, ivec3(w, h, src_index / 4), 0)); + outval[j] = inval[src_index % 4]; + } + + imageStore(image_out, pos, outval); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml new file mode 100644 index 00000000000..77491a52856 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml @@ -0,0 +1,10 @@ +permute: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: permute diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl new file mode 100644 index 00000000000..42c7f86aea8 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict RepeatArgs { + // With input_size (n, c_i, h, w) and repeat r + // out_size == (n, c_i * r, h, w) + ivec4 out_sizes; + ivec4 in_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); + + if (any(greaterThanEqual(out_whcn, out_sizes))) { + return; + } + + VEC4_T v; + // Loop over the 4 elements in texel, calculate the corresponding elem, and + // fetch. Not most efficient algorithm because likely we fetch same texel + // multiple times in this loop. + + for (int i=0; i<4;i++) { + ivec4 in_whcn = out_whcn; + in_whcn.z = (out_whcn.z + i) % in_sizes.z; + + ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); + + v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; + } + + imageStore(image_out, out_pos, v); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml new file mode 100644 index 00000000000..4147e82965a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml @@ -0,0 +1,10 @@ +repeat_channel: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: repeat_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl new file mode 100644 index 00000000000..f94e1120492 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl @@ -0,0 +1,52 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
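The repeat_channel shader above fills each output lane by wrapping the output channel back into the input channel range with a modulo, which is also why it may fetch the same input texel several times. A tiny C++ sketch, not part of the patch, with assumed channel counts:

#include <cassert>
#include <cstdio>

int main() {
  const int C_in = 3, repeat = 3;          // assumed: 3 input channels repeated 3 times
  const int C_out = C_in * repeat;         // 9 output channels
  const int first_out_channel = 4;         // first lane of some channels-packed output texel
  assert(first_out_channel + 3 < C_out);   // all four lanes are real channels here

  for (int i = 0; i < 4; ++i) {
    const int out_c = first_out_channel + i;
    const int in_c = out_c % C_in;         // same modulo as in_whcn.z in the shader
    // With channels packing, that element lives in input texel in_c / 4, lane in_c % 4.
    std::printf("out channel %d <- in channel %d (input texel %d, lane %d)\n",
                out_c, in_c, in_c / 4, in_c % 4);
  }
  // All four output lanes land in input texel 0 in this example, so the shader's
  // per-lane texelFetch hits the same texel repeatedly, as its comment notes.
  return 0;
}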
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { + // data.x: index along batch dim to select + // data.y: number of batches + // data.z: number of texels per batch + // data.w: unused + ivec4 select_info; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int num_batches = select_info.y; + const int num_texel_per_batch = select_info.z; + const int index = select_info.x; + + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const uint src_pos_z = (num_texel_per_batch * index) + pos.z; + imageStore( + image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml new file mode 100644 index 00000000000..9c7d54c8f69 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml @@ -0,0 +1,10 @@ +select_batch_4d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_batch_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl new file mode 100644 index 00000000000..0bbec798484 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
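The select_batch_4d shader above (and the other select_*_4d shaders that follow) lean on the layout described in indexing_utils.h: batches are concatenated along the texture depth, so batch n owns ceil(C/4) consecutive depth slices. A short C++ sketch of the src_pos_z arithmetic, not part of the patch, with assumed sizes:

#include <cassert>

int divup4(int x) { return (x + 3) / 4; }

int main() {
  const int C = 10, N = 4;                  // assumed channel and batch counts
  const int texels_per_batch = divup4(C);   // 3 depth slices per batch
  const int index = 2;                      // batch to select
  assert(index < N);

  // Output depth slice z reads input depth slice texels_per_batch * index + z,
  // which is exactly the shader's src_pos_z.
  for (int z = 0; z < texels_per_batch; ++z) {
    const int src_z = texels_per_batch * index + z;
    assert(src_z == 6 + z);                 // batch 2 occupies slices 6, 7, 8
  }
  return 0;
}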
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${texel_component_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// index to select +layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { + int index; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int tex = index / 4; + const int ind = index % 4; + const T v = VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, tex), 0))[ind]; + + imageStore(image_out, ivec3(pos.x, pos.y, 0), VEC4_T(v, 0, 0, 0)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml new file mode 100644 index 00000000000..1c5c4e34b06 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml @@ -0,0 +1,10 @@ +select_channel_3d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_channel_3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl new file mode 100644 index 00000000000..517362f76ea --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { + // data.x: index along channel dim to select + // data.y: number of batches + // data.z: number of texels per batch + // data.w: unused + ivec4 select_info; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int num_batches = select_info.y; + const int num_texel_per_batch = select_info.z; + const int index = select_info.x; + + // read in the same channel from 4 separate batches + VEC4_T out_texel = VEC4_T(0, 0, 0, 0); + for (int k = 0; k < 4; k++) { + if ((k + pos.z * 4) >= + num_batches) { + break; + } + const uint src_pos_z = (4 * num_texel_per_batch * pos.z) + + (k * num_texel_per_batch) + (index / 4); + const uint src_pos_t = index % 4; + out_texel[k] = + VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0))[src_pos_t]; + } + + imageStore(image_out, pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml new file mode 100644 index 00000000000..6236555f5dd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml @@ -0,0 +1,10 @@ +select_channel_4d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_channel_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl new file mode 100644 index 00000000000..87409fb35fd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// index to select +layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { + int index; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // w + const int src_x = pos.x; + // h + const int src_y = index; + // c + const int src_z = pos.y; + + const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0)); + + for (int i = 0; i < 4; i++) { + ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0); + + // When the C-channel exceeds original block size, exit early + if (new_pos.y >= sizes.y) { + return; + } + + imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0)); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml new file mode 100644 index 00000000000..a373f1decd9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml @@ -0,0 +1,10 @@ +select_height_3d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_height_3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl new file mode 100644 index 00000000000..2e4e2afb2db --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl @@ -0,0 +1,64 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// index to select +layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { + // data.x: index along height dim to select + // data.y: number of batches + // data.z: number of texels per batch + // data.w: unused + ivec4 select_info; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int num_batches = select_info.y; + const int num_texel_per_batch = select_info.z; + const int index = select_info.x; + + VEC4_T out_texel = VEC4_T(0, 0, 0, 0); + // read in the same channel from 4 separate batches + for (int k = 0; k < 4; k++) { + if ((k + pos.z * 4) >= num_batches + ) { // < 4 batches for this texel, exit early + break; + } + const uint src_pos_z = (pos.z * num_texel_per_batch * 4) + + k * num_texel_per_batch + (pos.y / 4); + out_texel[k] = VEC4_T(texelFetch( + image_in, ivec3(pos.x, index, src_pos_z), 0))[pos.y % 4]; + } + imageStore(image_out, pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml new file mode 100644 index 00000000000..c3724f1157a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml @@ -0,0 +1,10 @@ +select_height_4d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_height_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl new file mode 100644 index 00000000000..1e12d15ab21 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// index to select +layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { + int index; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // w + const int src_x = index; + // h + const int src_y = pos.x; + // c + const int src_z = pos.y; + + const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0)); + + for (int i = 0; i < 4; i++) { + ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0); + + // When the C-channel exceeds original block size, exit early + if (new_pos.y >= sizes.y) { + return; + } + + imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0)); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml new file mode 100644 index 00000000000..a3070bf6ca3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml @@ -0,0 +1,10 @@ +select_width_3d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_width_3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl new file mode 100644 index 00000000000..ffbd8afbda0 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +// index to select +layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { + // data.x: index along width dim to select + // data.y: number of batches + // data.z: number of texels per batch + // data.w: unused + ivec4 select_info; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + const int num_batches = select_info.y; + const int num_texel_per_batch = select_info.z; + const int index = select_info.x; + + //vec4 out_texel = vec4(0, 0, 0, 0); + VEC4_T out_texel = VEC4_T(0, 0, 0, 0); + // read in the same channel from 4 separate batches + for (int k = 0; k < 4; k++) { + if ((k + pos.z * 4) >= + num_batches) { // < 4 batches for this texel, exit early + break; + } + const uint src_pos_z = (pos.z * num_texel_per_batch * 4) + + k * num_texel_per_batch + (pos.y / 4); + + out_texel[k] = VEC4_T(texelFetch( + image_in, ivec3(index, pos.x, src_pos_z), 0))[pos.y % 4]; + } + imageStore(image_out, pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml new file mode 100644 index 00000000000..f1131d77395 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml @@ -0,0 +1,10 @@ +select_width_4d: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: select_width_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl new file mode 100644 index 00000000000..72594830cd4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg { + int dim; + int offset; + int step; + // Used when dim=batch. Stride is the # of plances for each batch value. 
+ int stride; +} +slice_arg; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (pos_out_of_bounds(pos, sizes, packed_dim)) { + return; + } + + ivec3 in_pos = pos; + + int index = pos[slice_arg.dim] / slice_arg.stride; + int within_stride = pos[slice_arg.dim] % slice_arg.stride; + + in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride + index * slice_arg.step * + slice_arg.stride + within_stride; + + imageStore(image_out, pos, texelFetch(image_in, in_pos, 0)); + +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml new file mode 100644 index 00000000000..9e69b09a304 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml @@ -0,0 +1,10 @@ +slice_batch_height_width: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: slice_batch_height_width diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl new file mode 100644 index 00000000000..607f77d8254 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes { + ivec4 out_sizes; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(set = 0, binding = 4) uniform PRECISION restrict SliceArg { + int offset; + int step; +} +slice_arg; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); + + if (any(greaterThanEqual(idx, out_sizes))) { + return; + } + + // We map the output pos using the buffer index. For each index in the texel, + // we calculate the source whcn-coordinate amended with offset-ed channel + // value. Then we calculate the actual texture position from the + // whcn-coordinate. 
+ const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + + vec4 outex; + for (int i=0;i<4;i++) { + ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], out_sizes); + + int in_channel = user_coor.z; + + ivec4 in_user_coor = user_coor; + in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step; + + ivec4 in_pow_elem = to_texture_elem_pos( + in_user_coor, + in_sizes, + packed_dim); + + vec4 v = texelFetch(image_in, in_pow_elem.xyz, 0); + + outex[i] = v[in_pow_elem.w]; + } + imageStore(image_out, out_pos, outex); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml new file mode 100644 index 00000000000..31c0642ecf6 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml @@ -0,0 +1,9 @@ +slice_channel: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: float + shader_variants: + - NAME: slice_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl index ed10ec2711c..03cd94fb3d7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl @@ -8,40 +8,38 @@ #version 450 core +#define PRECISION ${PRECISION} + #include "broadcasting_utils.h" #include "indexing_utils.h" -#define PRECISION ${PRECISION} - layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; // dim to sum layout(set = 0, binding = 3) uniform PRECISION restrict DimVal { - int data; -} -dim; + int dim; +}; // size of dim (in the input) layout(set = 0, binding = 4) uniform PRECISION restrict DimSize { - int data; -} -dim_size; + int dim_size; +}; layout(set = 0, binding = 5) uniform PRECISION restrict Channel { - int data; -} -flattened_channels; + int flattened_channels; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + /* * Returns a new tensor with values summed along dimension dim * Dimension dim is squeezed @@ -56,17 +54,21 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + vec4 out_texel = vec4(0); int src_n; int src_c; // Batch - if (dim.data == 0) { - for (int batch = 0; batch < dim_size.data; ++batch) { + if (dim == 0) { + for (int batch = 0; batch < dim_size; ++batch) { src_n = batch; src_c = pos.z; - int src_z = src_n * flattened_channels.data + src_c; + int src_z = src_n * flattened_channels + src_c; vec4 v = texelFetch(image_in, ivec3(pos.x, pos.y, src_z), 0); out_texel += v; } @@ -74,13 +76,13 @@ void main() { } // Channel - else if (dim.data == 1) { + else if (dim == 1) { for (int out_index = 0; out_index < 4; ++out_index) { - for (int channel = 0; channel < dim_size.data; ++channel) { + for (int channel = 0; channel < dim_size; ++channel) { src_n = pos.z * 4 + out_index; src_c = channel; int src_z = - src_n * flattened_channels.data + src_c / 4; + src_n * flattened_channels + src_c / 4; vec4 v = texelFetch(image_in, 
ivec3(pos.x, pos.y, src_z), 0); out_texel[out_index] += v[channel % 4]; } @@ -93,9 +95,9 @@ void main() { for (int out_index = 0; out_index < 4; ++out_index) { src_n = pos.z * 4 + out_index; src_c = pos.y; - int src_z = src_n * flattened_channels.data + src_c / 4; - for (int hw = 0; hw < dim_size.data; ++hw) { - vec4 v = (dim.data == 2) + int src_z = src_n * flattened_channels + src_c / 4; + for (int hw = 0; hw < dim_size; ++hw) { + vec4 v = (dim == 2) ? texelFetch(image_in, ivec3(pos.x, hw, src_z), 0) // Height : texelFetch(image_in, ivec3(hw, pos.x, src_z), 0); // Width out_texel[out_index] += v[pos.y % 4]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml index 15b8239b84d..de3fddce888 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml @@ -11,8 +11,6 @@ sum_dim: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: sum_dim diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl index cd54981f099..64d37a13e8f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl @@ -8,39 +8,37 @@ #version 450 core -#include "indexing_utils.h" - #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; // dim to sum layout(set = 0, binding = 3) uniform PRECISION restrict DimVal { - int data; -} -dim; + int dim; +}; // size of dim (in the input) layout(set = 0, binding = 4) uniform PRECISION restrict DimSize { - int data; -} -dim_size; + int dim_size; +}; layout(set = 0, binding = 5) uniform PRECISION restrict Channel { - int data; -} -flattened_channels; + int flattened_channels; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + /* * Returns a new tensor with values summed along dimension dim. * Output and input have same number of dimensions. 
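In the sum_dim shader above (and sum_dim_keepdim, which continues below), flattened_channels is the number of channel texels per batch, so element (n, c) of a given (x, y) lives in depth slice n * flattened_channels + c / 4, lane c % 4. A hedged C++ sketch, not part of the patch and with assumed sizes, of the layout that the channel-reduction loop walks:

#include <array>
#include <cassert>
#include <vector>

int main() {
  const int C = 6, N = 2;                               // assumed sizes
  const int flattened_channels = (C + 3) / 4;           // 2 channel texels per batch
  // One (x, y) column of the packed texture: each depth slice holds 4 channel lanes.
  std::vector<std::array<float, 4>> column(N * flattened_channels);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const int z = n * flattened_channels + c / 4;     // same formula as the shaders
      column[z][c % 4] = float(c + 100 * n);            // arbitrary test values
    }
  }

  // Reducing over channels (dim == 1) means visiting every (z, lane) pair of batch n.
  for (int n = 0; n < N; ++n) {
    float acc = 0.f;
    for (int c = 0; c < C; ++c) {
      acc += column[n * flattened_channels + c / 4][c % 4];
    }
    assert(acc == 15.f + 600.f * n);  // channels 0..5 sum to 15; batch 1 adds 100 per channel
  }
  return 0;
}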
@@ -50,29 +48,33 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + vec4 out_texel = vec4(0); int src_n; int src_c; // Batch - if (dim.data == 0) { - for (int batch = 0; batch < dim_size.data; ++batch) { + if (dim == 0) { + for (int batch = 0; batch < dim_size; ++batch) { src_n = batch; src_c = pos.z; - int src_z = src_n * flattened_channels.data + src_c; + int src_z = src_n * flattened_channels + src_c; out_texel += texelFetch(image_in, ivec3(pos.x, pos.y, src_z), 0); } imageStore(image_out, pos, out_texel); } // Channel - else if (dim.data == 1) { + else if (dim == 1) { for (int out_index = 0; out_index < 4; ++out_index) { - for (int channel = 0; channel < dim_size.data; ++channel) { + for (int channel = 0; channel < dim_size; ++channel) { src_n = pos.z; src_c = channel; - int src_z = src_n * flattened_channels.data + src_c / 4; + int src_z = src_n * flattened_channels + src_c / 4; vec4 v = texelFetch(image_in, ivec3(pos.x, pos.y, src_z), 0); out_texel[out_index] += v[channel % 4]; } @@ -82,8 +84,8 @@ void main() { // Height, Width else { - for (int hw = 0; hw < dim_size.data; ++hw) { - vec4 v = (dim.data == 2) + for (int hw = 0; hw < dim_size; ++hw) { + vec4 v = (dim == 2) ? texelFetch(image_in, ivec3(pos.x, hw, pos.z), 0) // Height : texelFetch(image_in, ivec3(hw, pos.y, pos.z), 0); // Width out_texel += v; diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml index 37635925748..f74bf229e5b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml @@ -11,8 +11,6 @@ sum_dim_keepdim: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: sum_dim_keepdim diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl index 5c8d4f845cc..85e2c5c1a5e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl @@ -10,37 +10,38 @@ #define PRECISION ${PRECISION} -#define OP(X, A, B) ${OPERATOR} +#define VEC4_T ${texel_type(DTYPE)} + +#define op(X, A, B) ${OPERATOR} + +#include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; +layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { + ivec3 out_limits; +}; layout(set = 0, binding = 3) uniform PRECISION restrict Min { - float data; -} -minimum; + float minimum; +}; layout(set = 0, binding = 4) uniform PRECISION restrict Max { - float data; -} -maximum; + float maximum; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_extents.data.xyz))) { + if (any(greaterThanEqual(pos, out_limits))) { return; } - vec4 in_texel = texelFetch(image_in, pos, 0); - imageStore(image_out, pos, OP(in_texel, minimum.data, maximum.data)); + VEC4_T in_texel = texelFetch(image_in, pos, 0); + imageStore(image_out, pos, op(in_texel, minimum, maximum)); } diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index a4cfa38432d..c32593d700c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -6,9 +6,7 @@ unary_op: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: abs OPERATOR: abs(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl new file mode 100644 index 00000000000..17e16fa09c6 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl @@ -0,0 +1,57 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes { + ivec4 out_sizes; +}; + +layout(set = 0, binding = 3) uniform PRECISION restrict InSizes { + ivec4 in_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + const ivec4 out_tensor_idx = to_tensor_idx(out_pos, out_sizes, packed_dim); + + if (all(greaterThanEqual(out_tensor_idx, out_sizes))) { + return; + } + + // Assume there is a virtual contiguous buffer in nchw format. From the output + // pos, we first calculate the index in the virtual buffer, and then calculate + // the input position from that index. + const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, packed_dim); + + VEC4_T value; + // Need to look up the 4 values in the output texel separately. + for (int i = 0; i < 4; i++) { + ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes); + ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, packed_dim); + VEC4_T intex = texelFetch(image_in, in_pos_elem.xyz, 0); + value[i] = intex[in_pos_elem.w]; + } + + imageStore(image_out, out_pos, value); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.yaml b/backends/vulkan/runtime/graph/ops/glsl/view.yaml new file mode 100644 index 00000000000..6ce0db3ddd1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/view.yaml @@ -0,0 +1,10 @@ +view: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: view diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 28988433974..7515f17b211 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -32,16 +32,16 @@ void resize_binary_op_node( const std::vector& args, const std::vector& extra_args) { (void)extra_args; - vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); + vTensorPtr out = graph->get_tensor(args[0].refs[0]); // TODO(T183442143): Verify tensors are broadcastable.
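// A possible sketch for that check (hypothetical, for illustration only):
// walk the sizes right-to-left and require each dimension pair to be equal
// or 1, along the lines of
//   for (int64_t i = 1; i <= std::min(self->dim(), other->dim()); ++i) {
//     const int64_t s = api::utils::val_at(-i, self->sizes());
//     const int64_t o = api::utils::val_at(-i, other->sizes());
//     VK_CHECK_COND(s == o || s == 1 || o == 1);
//   }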
- vTensor& self = graph->get_val(args[1].refs[0]).toTensor(); - vTensor& other = graph->get_val(args[1].refs[1]).toTensor(); + vTensorPtr self = graph->get_tensor(args[1].refs[0]); + vTensorPtr other = graph->get_tensor(args[1].refs[1]); std::vector new_out_sizes = - calculate_broadcasted_output_size(self, other); + calculate_broadcasted_output_size(*self, *other); - out.virtual_resize(new_out_sizes); + out->virtual_resize(new_out_sizes); } void add_binary_op_node( @@ -55,47 +55,49 @@ void add_binary_op_node( ValueRef arg2 = prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1)); - vTensor& t_in1 = graph.get_val(arg1).toTensor(); - vTensor& t_in2 = graph.get_val(arg2).toTensor(); - - vTensor& t_out = graph.get_val(out).toTensor(); + vTensorPtr t_in1 = graph.get_tensor(arg1); + vTensorPtr t_in2 = graph.get_tensor(arg2); + vTensorPtr t_out = graph.get_tensor(out); - check_binary_op_args(t_in1, t_in2, t_out); + check_binary_op_args(*t_in1, *t_in2, *t_out); - api::utils::uvec3 global_size = t_out.virtual_extents(); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); float alpha_val = 1.0f; // String is checked since floor_div passes in an unused string argument in // place of alpha - if (is_valid(alpha) && !graph.get_val(alpha).isString()) { - alpha_val = extract_scalar(graph.get_val(alpha)); + if (is_valid(alpha) && !graph.val_is_string(alpha)) { + alpha_val = graph.extract_scalar(alpha); } const api::utils::ivec2 broadcast_params = - create_broadcast_params(t_in1, t_in2); + create_broadcast_params(*t_in1, *t_in2); - std::stringstream kernel_name; - kernel_name << "binary_" << op_name; - apply_memory_layout_suffix(kernel_name, t_out); - apply_dtype_suffix(kernel_name, t_out); + std::string kernel_name("binary_"); + kernel_name.reserve(kShaderNameReserve); + kernel_name += op_name; + add_dtype_suffix(kernel_name, *t_out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs {{out, api::MemoryAccessType::WRITE}, {{arg1, arg2}, api::MemoryAccessType::READ}}, // Shader params buffers - {t_out.gpu_sizes_ubo(), - t_in1.gpu_sizes_ubo(), - t_in2.gpu_sizes_ubo(), + {t_out->sizes_ubo(), + t_in1->sizes_ubo(), + t_in2->sizes_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, - // Resizing - resize_binary_op_node)); + // Specialization Constants + {SV(t_out->gpu_memory_layout_int())}, + // Resizing Logic + resize_binary_op_node, + {})); } #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp new file mode 100644 index 00000000000..e95e7bdc00d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include + +namespace vkcompute { + +void add_clone_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef out) { + vTensorPtr t_out = graph.get_tensor(out); + + std::string kernel_name = "clone"; + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + {t_out->texture_limits_ubo()})); +} + +void clone(ComputeGraph& graph, const std::vector& args) { + // The vulkan delegate does not support changing memory format. + return add_clone_node(graph, args[0], args[2]); +} + +// Clone node is not the most efficient implementation for the aten.clone +// operation. A more efficient implementation can be achieved during vulkan +// export with the use of shared object. This clone node is introduced to enable +// a "copy" mechanism if there is no alternative (e.g. during direct +// ComputeGraph manipulation, we need to make a copy of a Tensor). + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.clone.default, clone); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp new file mode 100644 index 00000000000..d40352d2240 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -0,0 +1,503 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include +#include + +#include + +namespace vkcompute { + +void resize_conv2d_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr self = graph->get_tensor(args[1].refs[0]); + + size_t ndim = self->sizes().size(); + std::vector new_out_sizes(ndim); + const bool transposed = graph->get_bool(extra_args[4]); + + // Batch, Channel + if (ndim == 4) { + new_out_sizes.at(ndim - 4) = self->sizes().at(ndim - 4); + } + + TensorRefPtr weight_ref = graph->get_tref(extra_args[0]); + const auto& weight_sizes = weight_ref->sizes; + new_out_sizes.at(ndim - 3) = + transposed ? 
weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4); + + // Height, Width + const auto& new_out_sizes_hw = calc_out_sizes_hw( + *graph, + self->sizes(), + extra_args[0], + /*kernel_size_only = */ false, + {extra_args[1], extra_args[2], extra_args[3], extra_args[5]}, + transposed); + new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); + new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); + + out->virtual_resize(new_out_sizes); +} + +void resize_conv1d_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr self = graph->get_tensor(args[1].refs[0]); + TensorRefPtr weight_ref = graph->get_tref(extra_args[0]); + + int64_t stride_size = graph->get_int_list(extra_args[1])->at(0); + int64_t padding_size = graph->get_int_list(extra_args[2])->at(0); + int64_t dilation_size = graph->get_int_list(extra_args[3])->at(0); + + const std::vector& weight_sizes = weight_ref->sizes; + + const std::vector& in_sizes = self->sizes(); + size_t ndim = in_sizes.size(); + std::vector new_out_sizes(ndim); + + int64_t kernel_size = weight_sizes.at(2); + int64_t in_length = in_sizes.at(2); + + new_out_sizes.at(0) = in_sizes.at(0); + new_out_sizes.at(1) = weight_sizes.at(0); + new_out_sizes.at(2) = calc_out_size( + in_length, kernel_size, stride_size, padding_size, dilation_size, false); + + out->virtual_resize(new_out_sizes); +} + +ValueRef prepack_biases( + ComputeGraph& graph, + const ValueRef vref, + const ValueRef weight, + const bool transposed, + const api::StorageType storage_type, + const api::GPUMemoryLayout memory_layout) { + auto sizes = graph.get_sizes_of(weight); + const int64_t out_channels = transposed ? sizes.at(1) : sizes.at(0); + + ValueRef v = graph.add_tensor( + {out_channels}, graph.get_dtype_of(weight), storage_type, memory_layout); + vTensorPtr t = graph.get_tensor(v); + + api::ShaderInfo shader = get_nchw_to_image_shader(*t); + + api::utils::uvec3 global_size = t->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + global_size, + local_size, + vref, + v, + {t->sizes_ubo()}, + // Specialization constants + {SV(t->gpu_memory_layout_int())})); + + return v; +} + +enum class Conv2dMethod : uint8_t { + Depthwise, + Pointwise, + SlidingWindow, + Transposed, +}; + +api::ShaderInfo get_conv2d_shader( + ComputeGraph& graph, + const vTensor& t_out, + const bool prepack_weights, + const Conv2dMethod method, + const ValueRef weight) { + std::string kernel_name; + kernel_name.reserve(kShaderNameReserve); + switch (method) { + case Conv2dMethod::Depthwise: + kernel_name = "conv2d_dw"; + if (!prepack_weights) { + const auto& weight_sizes = graph.get_tref(weight)->sizes; + if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { + kernel_name += "_output_tile_3x3"; + } + if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { + kernel_name += "_output_tile_5x5"; + } + } + break; + case Conv2dMethod::Pointwise: + if (prepack_weights) { + kernel_name = "conv2d"; + } else { + kernel_name = "conv2d_pw"; + } + break; + case Conv2dMethod::SlidingWindow: + kernel_name = "conv2d"; + break; + case Conv2dMethod::Transposed: + kernel_name = "conv_transpose2d"; + break; + } + if (prepack_weights) { + kernel_name += "_prepack_weights"; + } + add_dtype_suffix(kernel_name, t_out); + + return VK_KERNEL_FROM_STR(kernel_name); +} + +std::vector get_final_sizes( + const std::vector& original_sizes, + const 
Conv2dMethod method) { + int64_t batch_padded = + api::utils::align_up(api::utils::val_at(-4, original_sizes), INT64_C(4)); + int64_t channels_padded = + api::utils::align_up(api::utils::val_at(-3, original_sizes), INT64_C(4)); + int64_t channels = api::utils::val_at(-3, original_sizes); + int64_t height = api::utils::val_at(-2, original_sizes); + int64_t width = api::utils::val_at(-1, original_sizes); + + switch (method) { + case Conv2dMethod::Depthwise: + return std::vector{ + 4, batch_padded * channels / 4, height * width}; + case Conv2dMethod::Pointwise: + case Conv2dMethod::SlidingWindow: + return std::vector{ + 4, batch_padded * height / 4, channels_padded * width}; + case Conv2dMethod::Transposed: + return std::vector{ + 4, channels_padded * height / 4, batch_padded * width}; + } +} + +std::vector get_padded_sizes( + const std::vector& original_sizes, + const Conv2dMethod method) { + int64_t batch_padded = + api::utils::align_up(api::utils::val_at(-4, original_sizes), INT64_C(4)); + int64_t channels_padded = + api::utils::align_up(api::utils::val_at(-3, original_sizes), INT64_C(4)); + + switch (method) { + case Conv2dMethod::Depthwise: + return std::vector{-1, batch_padded}; + case Conv2dMethod::Pointwise: + case Conv2dMethod::SlidingWindow: + case Conv2dMethod::Transposed: + return std::vector{batch_padded, channels_padded}; + } +} + +ValueRef prepack_weights( + ComputeGraph& graph, + const ValueRef vref, + const Conv2dMethod method) { + const auto original_sizes = graph.get_sizes_of(vref); + const auto final_sizes = get_final_sizes(original_sizes, method); + + ValueRef v = graph.add_tensor( + final_sizes, + graph.get_dtype_of(vref), + api::kTexture2D, + api::kChannelsPacked); + vTensorPtr t = graph.get_tensor(v); + + api::utils::uvec3 global_size = t->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + api::ShaderInfo shader = + get_conv2d_shader(graph, *t, /*prepack_weights = */ true, method, vref); + + const auto& padded_sizes = get_padded_sizes(original_sizes, method); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + global_size, + local_size, + vref, + v, + {t->sizes_ubo(), + graph.create_params_buffer( + api::utils::make_ivec4(original_sizes, /*reverse = */ true)), + graph.create_params_buffer( + api::utils::make_ivec2(padded_sizes, /*reverse = */ true))}, + // Specialization constants + {SV(t->gpu_memory_layout_int())})); + + return v; +} + +void check_conv_args(const vTensor& in, const vTensor& out) { + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); +} + +struct Conv2dParams final { + api::utils::ivec2 overlay_region; + int in_group_size; +}; + +Conv2dParams create_conv2d_params( + ComputeGraph& graph, + const ValueRef weight, + const Kernel2dParams& p, + const bool transposed) { + const auto& overlay_region = api::utils::make_ivec2({ + p.kernel_size.data[0] + + (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1), + p.kernel_size.data[1] + + (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1), + }); + const auto weight_sizes = graph.get_sizes_of(weight); + const int32_t in_group_size = + api::utils::safe_downcast(api::utils::align_up( + transposed ? 
weight_sizes.at(0) : weight_sizes.at(1), INT64_C(4))); + return {overlay_region, in_group_size}; +} + +void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { + if (transposed) { + if (p.dilation.data[0] > 1 || p.dilation.data[1] > 1) { + VK_THROW( + "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); + } + } + if ((p.padding.data[0] > 0 && p.kernel_size.data[0] > 1 && + p.dilation.data[0] > 1) || + (p.padding.data[1] > 0 && p.kernel_size.data[1] > 1 && + p.dilation.data[1] > 1)) { + VK_THROW( + "aten.convolution.default: padding > 0 while dilation, kernel_size > 1 is not supported yet!"); + } +} + +Conv2dMethod get_conv2d_method( + ComputeGraph& graph, + const ValueRef weight, + const int64_t groups, + const bool transposed) { + const auto weight_sizes = graph.get_sizes_of(weight); + if (!transposed && weight_sizes.at(0) == groups && weight_sizes.at(1) == 1) { + return Conv2dMethod::Depthwise; + } + if (groups > 1) { + VK_THROW("aten.convolution.default: groups > 1 is not supported yet!"); + } + if (transposed) { + return Conv2dMethod::Transposed; + } + if (weight_sizes.at(2) == 1 && weight_sizes.at(3) == 1) { + return Conv2dMethod::Pointwise; + } + return Conv2dMethod::SlidingWindow; +} + +void add_conv2d_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef transposed, + const ValueRef output_padding, + const ValueRef groups, + const ValueRef out) { + const bool transposed_val = graph.get_bool(transposed); + const int64_t groups_val = graph.get_int(groups); + + const Conv2dMethod method = + get_conv2d_method(graph, weight, groups_val, transposed_val); + + ValueRef arg_in = prepack_if_tensor_ref(graph, in); + ValueRef arg_weight = prepack_weights(graph, weight, method); + ValueRef arg_bias = prepack_biases( + graph, + bias, + weight, + transposed_val, + /* storage_type = */ api::kTexture2D, + /* memory_layout = */ api::kWidthPacked); + + vTensorPtr t_in = graph.get_tensor(arg_in); + vTensorPtr t_out = graph.get_tensor(out); + if (t_in->sizes().at(0) > 1) { + VK_THROW("conv2d: input batch size > 1 is not supported yet!"); + } + check_conv_args(*t_in, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + Kernel2dParams kernel_params = create_kernel2d_params( + graph, + weight, + /*kernel_size_only = */ false, + stride, + padding, + dilation); + Conv2dParams extra_params = + create_conv2d_params(graph, weight, kernel_params, transposed_val); + + check_conv2d_params(kernel_params, transposed_val); + + api::ShaderInfo shader = get_conv2d_shader( + graph, *t_out, /*prepack_weights = */ false, method, weight); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + shader, + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, + {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}}, + // Shader params buffers + { + t_out->texture_limits_ubo(), + t_in->sizes_ubo(), + graph.create_params_buffer(kernel_params), + graph.create_params_buffer(extra_params), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_conv2d_node, + {weight, stride, padding, dilation, transposed, output_padding})); +} + +void add_conv1d_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, 
+ const ValueRef dilation, + const ValueRef groups, + const ValueRef out) { + ValueRef arg_in = prepack_if_tensor_ref(graph, in); + ValueRef arg_weight = prepack_if_tensor_ref(graph, weight, api::kWidthPacked); + ValueRef arg_bias = prepack_biases( + graph, + bias, + weight, + /*transposed = */ false, + /*storage_type = */ api::kTexture3D, + /*memory_layout = */ api::kChannelsPacked); + + vTensorPtr t_in = graph.get_tensor(arg_in); + vTensorPtr t_weight = graph.get_tensor(arg_weight); + vTensorPtr t_bias = graph.get_tensor(arg_bias); + vTensorPtr t_out = graph.get_tensor(out); + const int64_t groups_val = graph.get_int(groups); + + std::vector in_sizes = t_in->sizes(); + std::vector weight_sizes = t_weight->sizes(); + std::vector out_sizes = t_out->sizes(); + + check_conv_args(*t_in, *t_out); + + int32_t in_channels = in_sizes.at(1); + int32_t out_channels = weight_sizes.at(0); + int32_t kernel_size = weight_sizes.at(2); + int32_t stride_size = graph.get_int_list(stride)->at(0); + int32_t padding_size = graph.get_int_list(padding)->at(0); + int32_t dilation_size = graph.get_int_list(dilation)->at(0); + int32_t in_group_size = static_cast(in_channels / groups_val); + int32_t out_group_size = static_cast(out_channels / groups_val); + + api::utils::uvec3 global_size = {1, static_cast(out_channels), 1}; + api::utils::uvec3 local_size = {1, 1, 1}; + + Kernel1dParams kernel_params = { + kernel_size, + stride_size, + padding_size, + dilation_size, + in_group_size, + out_group_size}; + + std::string kernel_name("conv1d"); + kernel_name.reserve(kShaderNameReserve); + + add_dtype_suffix(kernel_name, *t_out); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, + {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}}, + // Shader params buffers + { + t_out->texture_limits_ubo(), + t_in->sizes_ubo(), + graph.create_params_buffer(kernel_params), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_conv1d_node, + {weight, stride, padding, dilation})); +} + +void conv(ComputeGraph& graph, const std::vector& args) { + int64_t in_ndim = graph.get_tensor(args[0])->sizes().size(); + if (in_ndim == 4) { + return add_conv2d_node( + graph, + args[0], + args[1], + args[2], + args[3], + args[4], + args[5], + args[6], + args[7], + args[8], + args[9]); + } else { + return add_conv1d_node( + graph, + args[0], + args[1], + args[2], + args[3], + args[4], + args[5], + args[8], + args[9]); + } +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.convolution.default, conv); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp new file mode 100644 index 00000000000..0a5e20e4f7c --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +namespace vkcompute { + +void add_copy_offset_node( + ComputeGraph& graph, + const ValueRef in, + const api::utils::ivec3& range, + const api::utils::ivec3& src_offset, + const api::utils::ivec3& dst_offset, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); + + std::string kernel_name = "copy_offset"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = api::utils::make_uvec3(range); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + api::utils::ivec3 range; + int32_t unused0; + api::utils::ivec3 src_offset; + int32_t unused1; + api::utils::ivec3 dst_offset; + int32_t unused2; + } offset_params{ + range, + 0, + src_offset, + 0, + dst_offset, + 0, + }; + + auto shader = VK_KERNEL_FROM_STR(kernel_name); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + // Parameter buffers + {t_out->texture_limits_ubo(), + t_in->texture_limits_ubo(), + graph.create_params_buffer(offset_params)}, + // Specialization Constants + {})); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h new file mode 100644 index 00000000000..6e0deb6b74e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace vkcompute { + +void add_copy_offset_node( + ComputeGraph& graph, + const ValueRef in, + const api::utils::ivec3& range, + const api::utils::ivec3& src_offset, + const api::utils::ivec3& dst_offset, + const ValueRef out); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp new file mode 100644 index 00000000000..5c1548df900 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Full.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +namespace vkcompute { + +void resize_full_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + std::vector out_sizes = *graph->get_int_list(extra_args[0]); + + out->virtual_resize(out_sizes); +} + +void add_full_node( + ComputeGraph& graph, + const ValueRef size, + const ValueRef fill_value, + const ValueRef out) { + float fill_value_val = graph.extract_scalar(fill_value); + vTensorPtr t_out = graph.get_tensor(out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + std::string kernel_name("full"); + kernel_name.reserve(kShaderNameReserve); + + add_dtype_suffix(kernel_name, *t_out); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}}, + // Shader params buffers + {t_out->sizes_ubo(), graph.create_params_buffer(fill_value_val)}, + // Specialization Constants + {SV(t_out->gpu_memory_layout_int())}, + // Resizing Logic + resize_full_node, + {size})); +} + +void full(ComputeGraph& graph, const std::vector& args) { + return add_full_node(graph, args[0], args[1], args[6]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.full.default, full); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index a20cce2ce87..053ef0ff350 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -25,9 +25,8 @@ void check_matmul_args( VK_CHECK_COND(check_same_ndim(mat1, mat2)); VK_CHECK_COND( - check_memory_layout_is( - mat1, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED) || - check_memory_layout_is(mat1, api::GPUMemoryLayout::TENSOR_WIDTH_PACKED)); + check_memory_layout_is(mat1, api::kChannelsPacked) || + check_memory_layout_is(mat1, api::kWidthPacked)); VK_CHECK_COND(check_same_memory_layout(mat1, out)); VK_CHECK_COND(check_same_sizes_at(mat1, -1, mat2, -2)); @@ -38,22 +37,22 @@ void resize_matmul_node( const std::vector& args, const std::vector& extra_args) { (void)extra_args; - vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); - vTensor& mat1 = graph->get_val(args[1].refs[0]).toTensor(); - vTensor& mat2 = graph->get_val(args[1].refs[1]).toTensor(); + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); std::vector new_out_sizes(3); - if (mat1.sizes().size() == 2) { + if (mat1->sizes().size() == 2) { new_out_sizes.resize(2); - new_out_sizes.at(0) = mat1.sizes().at(0); - new_out_sizes.at(1) = mat2.sizes().at(1); + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = mat2->sizes().at(1); } else { - new_out_sizes.at(0) = mat1.sizes().at(0); - new_out_sizes.at(1) = mat1.sizes().at(1); - new_out_sizes.at(2) = mat2.sizes().at(2); + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = mat1->sizes().at(1); + new_out_sizes.at(2) = mat2->sizes().at(2); } - out.virtual_resize(new_out_sizes); + out->virtual_resize(new_out_sizes); } void add_matmul_node( @@ -61,42 +60,43 @@ void add_matmul_node( const ValueRef mat1, const ValueRef mat2, const ValueRef out) { - ValueRef arg1 = prepack_if_tensor_ref( - graph, mat1, api::GPUMemoryLayout::TENSOR_WIDTH_PACKED); + 
ValueRef arg1 = prepack_if_tensor_ref(graph, mat1, api::kWidthPacked); - api::GPUMemoryLayout mat2_layout = graph.memory_layout_of(arg1) == - api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED - ? api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED - : api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED; + api::GPUMemoryLayout mat2_layout = + graph.memory_layout_of(arg1) == api::kChannelsPacked + ? api::kChannelsPacked + : api::kHeightPacked; ValueRef arg2 = prepack_if_tensor_ref(graph, mat2, mat2_layout); - vTensor& t_mat1 = graph.get_val(arg1).toTensor(); - vTensor& t_mat2 = graph.get_val(arg2).toTensor(); - vTensor& t_out = graph.get_val(out).toTensor(); + vTensorPtr t_mat1 = graph.get_tensor(arg1); + vTensorPtr t_mat2 = graph.get_tensor(arg2); + vTensorPtr t_out = graph.get_tensor(out); - check_matmul_args(t_mat1, t_mat2, t_out); + check_matmul_args(*t_mat1, *t_mat2, *t_out); - api::utils::uvec3 global_size = t_out.virtual_extents(); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - std::stringstream kernel_name; - kernel_name << "matmul"; - apply_memory_layout_suffix(kernel_name, t_mat1); - apply_memory_layout_suffix(kernel_name, t_mat2); - apply_dtype_suffix(kernel_name, t_out); + std::string kernel_name("matmul"); + kernel_name.reserve(kShaderNameReserve); + add_memory_layout_suffix(kernel_name, *t_mat1); + add_memory_layout_suffix(kernel_name, *t_mat2); + add_dtype_suffix(kernel_name, *t_out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs {{out, api::MemoryAccessType::WRITE}, {{arg1, arg2}, api::MemoryAccessType::READ}}, // Shader params buffers - {t_out.extents_ubo(), t_mat1.cpu_sizes_ubo()}, - // Resizing + {t_out->texture_limits_ubo(), t_mat1->sizes_ubo()}, + // Specialization Constants + {}, + // Resizing Logic resize_matmul_node)); } diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp new file mode 100644 index 00000000000..0c579274448 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include + +#include + +namespace vkcompute { + +std::vector calc_out_mean_sizes( + vTensor& self, + int64_t normalized_shape_dim) { + std::vector output_size = self.sizes(); + int64_t self_dim = self.sizes().size(); + for (int64_t i = 0; i < normalized_shape_dim; ++i) { + output_size.at(self_dim - i - 1) = 1; + } + return output_size; +} + +void resize_native_layer_norm_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mean = graph->get_tensor(args[0].refs[1]); + vTensorPtr rstd = graph->get_tensor(args[0].refs[2]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + std::vector in_sizes = in->sizes(); + + const auto normalized_shape_dim = graph->get_int_list(extra_args[0])->size(); + + std::vector mean_size = + calc_out_mean_sizes(*in, normalized_shape_dim); + + out->virtual_resize(in_sizes); + mean->virtual_resize(mean_size); + rstd->virtual_resize(mean_size); +} + +void check_args(const vTensor& in, const vTensor& out) { + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); +} + +void add_native_layer_norm_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef normalized_shape, + const ValueRef weight, + const ValueRef bias, + const ValueRef eps, + const ValueRef out) { + const auto normalized_shape_dim = + graph.get_int_list(normalized_shape)->size(); + if (normalized_shape_dim > 1) { + VK_THROW("native_layer_norm only supports normalized_shape with dim == 1"); + } + + if (graph.val_is_none(weight)) { + VK_THROW("native_layer_norm requires weight to be non-None"); + } + + if (graph.val_is_none(bias)) { + VK_THROW("native_layer_norm requires bias to be non-None"); + } + + ValueRef arg_in = prepack_if_tensor_ref(graph, in); + ValueRef arg_weight = + prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in)); + ValueRef arg_bias = + prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in)); + + const auto out_val = graph.get_value_list(out); + vTensorPtr t_out = graph.get_tensor(out_val->at(0)); + vTensorPtr t_mean = graph.get_tensor(out_val->at(1)); + vTensorPtr t_input = graph.get_tensor(in); + float epsilon = graph.extract_scalar(eps); + + check_args(*t_input, *t_out); + + std::vector in_sizes = t_input->sizes(); + + api::utils::uvec3 global_size = t_mean->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + std::string kernel_name("native_layer_norm"); + kernel_name.reserve(kShaderNameReserve); + + add_dtype_suffix(kernel_name, *t_out); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{{out_val->at(0), out_val->at(1), out_val->at(2)}, + api::MemoryAccessType::WRITE}, + {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}}, + // Shader params buffers + {t_out->texture_limits_ubo(), + t_out->sizes_ubo(), + graph.create_params_buffer(epsilon)}, + // Specialization Constants + {}, + // Resizing Logic + resize_native_layer_norm_node, + {normalized_shape})); +} + +void native_layer_norm(ComputeGraph& graph, const std::vector& args) { + return add_native_layer_norm_node( + graph, args[0], args[1], args[2], args[3], args[4], args[5]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.native_layer_norm.default, native_layer_norm); +} + +} // namespace vkcompute diff --git 
a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp new file mode 100644 index 00000000000..14b77e3b451 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +using api::utils::ivec3; +using api::utils::uvec2; +using api::utils::uvec4; + +namespace { + +void check_args( + const vTensor& in, + const std::vector& permute_dims, + const vTensor& out) { + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); + + // This implementation does not require the input tensor to have the same + // dim size as the argument. The code will work as long as the input tensor's + // dim size is shorter than the permute dim array. In this case, the code + // assumes a size of 1 at the higher dimensions. + + int64_t out_dim = out.dim(); + VK_CHECK_COND( + out_dim == permute_dims.size(), + "Output tensor dim size must match argument"); +} + +} // namespace + +void add_permute_node( + ComputeGraph& graph, + ValueRef in, + const std::vector& permute_dims, + ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + check_args(*t_in, permute_dims, *t_out); + + uvec4 out_dims{0u, 1u, 2u, 3u}; + + int64_t out_dim = t_out->dim(); + std::vector seen(out_dim); + for (int i = 0; i < t_out->dim(); i++) { + int64_t permute_dim = permute_dims[i]; + VK_CHECK_COND( + !seen[permute_dim], "Argument dim ", permute_dim, " is repeated"); + seen[permute_dim] = true; + + out_dims.data[(4u - out_dim) + i] = permute_dim + (4u - out_dim); + } + + std::string kernel_name = "permute"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + uint32_t out_channels = dim_at(t_out->sizes()); + uint32_t in_channels = dim_at(t_in->sizes()); + + uint32_t out_c_aligned = api::utils::align_up(out_channels, 4u); + uint32_t in_c_aligned = api::utils::align_up(in_channels, 4u); + + const struct Block final { + uvec4 out_ndims; + uvec2 ch_info; + } params{ + out_dims, + {out_c_aligned, in_c_aligned}, + }; + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + {t_out->texture_limits_ubo(), + t_out->sizes_ubo(), + graph.create_params_buffer(params)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); +} + +void add_permute_node( + ComputeGraph& graph, + ValueRef in, + ValueRef permute_dims_ref, + ValueRef out) { + IntListPtr permute_dims = graph.get_int_list(permute_dims_ref); + + add_permute_node(graph, in, *permute_dims, out); +} + +void permute(ComputeGraph& graph, const std::vector& args) { + return add_permute_node(graph, args[0], args[1], args[2]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.permute.default, permute); + VK_REGISTER_OP(aten.permute_copy.default, permute); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.h
b/backends/vulkan/runtime/graph/ops/impl/Permute.h new file mode 100644 index 00000000000..941a8896fe2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_permute_node( + ComputeGraph& graph, + ValueRef in, + const std::vector& permute_dims, + ValueRef out); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index fdc17762fd8..87aed6e273f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -21,55 +21,36 @@ void resize_max_pool2d_node( ComputeGraph* graph, const std::vector& args, const std::vector& extra_args) { - vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); - vTensor& indices = graph->get_val(args[0].refs[1]).toTensor(); - vTensor& self = graph->get_val(args[1].refs[0]).toTensor(); + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr indices = graph->get_tensor(args[0].refs[1]); + vTensorPtr self = graph->get_tensor(args[1].refs[0]); - size_t ndim = self.sizes().size(); + size_t ndim = self->sizes().size(); std::vector new_out_sizes(ndim); - // Batch + // Batch, Channel if (ndim == 4) { - new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4); + new_out_sizes.at(ndim - 4) = self->sizes().at(ndim - 4); } - // Channel - new_out_sizes.at(ndim - 3) = self.sizes().at(ndim - 3); - - const auto kernel_size = reverse(*graph, extra_args[0]); - const auto stride = reverse(*graph, extra_args[1]); - const auto padding = reverse(*graph, extra_args[2]); - const auto dilation = reverse(*graph, extra_args[3]); - const bool ceil_mode = graph->get_val(extra_args[4]).toBool(); - - // Height - new_out_sizes.at(ndim - 2) = calc_out_size( - self.sizes().at(ndim - 2), - kernel_size.data[1], - stride.data[1], - padding.data[1], - dilation.data[1], - ceil_mode); - // Width - new_out_sizes.at(ndim - 1) = calc_out_size( - self.sizes().at(ndim - 1), - kernel_size.data[0], - stride.data[0], - padding.data[0], - dilation.data[0], - ceil_mode); - - VK_CHECK_COND(new_out_sizes.at(ndim - 2) >= 1); - VK_CHECK_COND(new_out_sizes.at(ndim - 1) >= 1); - - out.virtual_resize(new_out_sizes); - indices.virtual_resize(new_out_sizes); + new_out_sizes.at(ndim - 3) = self->sizes().at(ndim - 3); + + // Height, Width + const auto& new_out_sizes_hw = calc_out_sizes_hw( + *graph, + self->sizes(), + extra_args[0], + /*kernel_size_only = */ true, + {extra_args[1], extra_args[2], extra_args[3], extra_args[4]}); + new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); + new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); + + out->virtual_resize(new_out_sizes); + indices->virtual_resize(new_out_sizes); } void check_max_pool2d_args(const vTensor& in, const vTensor& out) { - VK_CHECK_COND( - check_memory_layout_is(in, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED)); - VK_CHECK_COND(check_memory_layout_is( - out, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED)); + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); } void add_max_pool2d_node( @@ -82,42 +63,44 @@ void add_max_pool2d_node( const ValueRef ceil_mode, const ValueRef out) { ValueRef 
arg = prepack_if_tensor_ref(graph, in); - vTensor& t_in = graph.get_val(arg).toTensor(); + vTensorPtr t_in = graph.get_tensor(arg); - const auto& out_val = graph.get_val(out).toValueList(); - vTensor& t_out = graph.get_val(out_val[0]).toTensor(); + const auto out_val = graph.get_value_list(out); + vTensorPtr t_out = graph.get_tensor(out_val->at(0)); - check_max_pool2d_args(t_in, t_out); + check_max_pool2d_args(*t_in, *t_out); - api::utils::uvec3 global_size = t_out.virtual_extents(); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - std::stringstream kernel_name; - kernel_name << "max_pool2d"; - apply_dtype_suffix(kernel_name, t_out); + std::string kernel_name("max_pool2d"); + add_dtype_suffix(kernel_name, *t_out); - KernelParams kernel_params{ - reverse(graph, kernel_size), - reverse(graph, stride), - reverse(graph, padding), - reverse(graph, dilation), - }; + Kernel2dParams kernel_params = create_kernel2d_params( + graph, + kernel_size, + /*kernel_size_only = */ true, + stride, + padding, + dilation); graph.execute_nodes().emplace_back(new ExecuteNode( graph, - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs - {{{out_val[0], out_val[1]}, api::MemoryAccessType::WRITE}, + {{{out_val->at(0), out_val->at(1)}, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}}, // Shader params buffers { - t_out.extents_ubo(), - t_in.extents_ubo(), + t_out->texture_limits_ubo(), + t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), }, - // Resizing + // Specialization Constants + {}, + // Resizing Logic resize_max_pool2d_node, {kernel_size, stride, padding, dilation, ceil_mode})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp new file mode 100644 index 00000000000..dedc7978ada --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +namespace vkcompute { + +namespace { + +void check_args( + const vTensor& in, + const std::vector& repeats, + const vTensor& out) { + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); + + int64_t in_dim = in.dim(); + VK_CHECK_COND( + in_dim <= repeats.size(), + "Input tensor dim size must be not greater than the repeat argument's size"); + + VK_CHECK_COND( + dim_at(in.sizes()) * dim_at(repeats) == + dim_at(out.sizes()), + "Output's width doesn't match input's width * repeat count"); + + VK_CHECK_COND( + dim_at(in.sizes()) * dim_at(repeats) == + dim_at(out.sizes()), + "Output's height doesn't match input's height * repeat count"); + + VK_CHECK_COND( + dim_at(in.sizes()) * dim_at(repeats) == + dim_at(out.sizes()), + "Output's channel doesn't match input's channel * repeat count"); + + VK_CHECK_COND( + dim_at(in.sizes()) * dim_at(repeats) == + dim_at(out.sizes()), + "Output's batch doesn't match input's batch * repeat count"); +} + +} // namespace + +void add_repeat_channel_node( + ComputeGraph& graph, + ValueRef in, + int64_t repeat_channel, + ValueRef out, + api::utils::ivec3& running_range) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + std::string kernel_name = "repeat_channel"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + const std::vector& in_sizes = t_in->sizes(); + + int32_t in_width = + api::utils::safe_downcast(dim_at(in_sizes)); + int32_t in_height = + api::utils::safe_downcast(dim_at(in_sizes)); + int32_t in_channel = + api::utils::safe_downcast(dim_at(in_sizes)); + int32_t in_batch = + api::utils::safe_downcast(dim_at(in_sizes)); + + int32_t out_channel = repeat_channel * in_channel; + + api::utils::ivec4 out_whcn_sizes{in_width, in_height, out_channel, in_batch}; + + api::utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch}; + + // Channel packed global work ids + running_range.data[2] = + out_whcn_sizes.data[3] * api::utils::div_up(out_whcn_sizes.data[2], 4); + api::utils::uvec3 global_size = api::utils::make_uvec3(running_range); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + api::utils::ivec4 out_sizes; + api::utils::ivec4 in_size; + } repeat_channel_args{ + out_whcn_sizes, + in_whcn_sizes, + }; + + auto shader = VK_KERNEL_FROM_STR(kernel_name); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + // Parameter buffers + {graph.create_params_buffer(repeat_channel_args)}, + // Specialization Constants + {SV(t_out->gpu_memory_layout_int())})); +} + +void add_repeat_node( + ComputeGraph& graph, + ValueRef in, + ValueRef repeats_ref, + ValueRef out) { + std::vector repeats = *(graph.get_int_list(repeats_ref)); + + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + check_args(*t_in, repeats, *t_out); + + // In this function, we expand the dimensions in the following order: + // 1. Channel + // 2. Width + // 3. Height + // 4. Batch + // After expanding a dimension, we will update the "running_range" since we + // will need to copy the "expanded" area. 
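// As a hypothetical walk-through: for an input of sizes {1, 2, 3, 4} (NCHW)
// with repeats {2, 2, 2, 2}, the channel pass writes the channel-doubled
// {1, 4, 3, 4} data into the output and running_range initially covers that
// region; the width, height and batch passes then re-copy the already-expanded
// region with growing offsets via add_copy_offset_node, doubling running_range
// along x, y and z in turn until the full {2, 4, 6, 8} output is covered.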
+ + api::utils::ivec3 running_range = t_in->texture_limits(); + + const std::vector& in_sizes = t_in->sizes(); + + // Since we use channel packing, repeating the channel dimension is the most + // complicated and time-consuming, as we need to reason over misaligned + // channels. Hence we expand it first to minimize cost. Also, in this first + // dimension, we copy over the input texure to the output. In subsequent + // dimensions, we read and write from the same tensor. + + if (int64_t channel_repeat = dim_at(repeats); + channel_repeat == 1) { + // If no repeat, short-cut to a direct copy + api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false); + api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false); + + add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out); + + } else { + add_repeat_channel_node(graph, in, channel_repeat, out, running_range); + } + + // TODO: refactor width, height, and batch into a common helper function. + // Width + if (int64_t width_repeat = dim_at(repeats); width_repeat > 1) { + api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false); + + for (int i = 1; i < width_repeat; ++i) { + api::utils::ivec3 dst_offset = api::utils::make_ivec3( + {i * dim_at(in_sizes), 0, 0}, false); + + add_copy_offset_node( + graph, out, running_range, src_offset, dst_offset, out); + } + + running_range.data[0] = running_range.data[0] * width_repeat; + } + + // Height + if (int64_t height_repeat = dim_at(repeats); + height_repeat > 1) { + api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false); + + for (int i = 1; i < height_repeat; ++i) { + api::utils::ivec3 dst_offset = api::utils::make_ivec3( + {0, i * dim_at(in_sizes), 0}, false); + + add_copy_offset_node( + graph, out, running_range, src_offset, dst_offset, out); + } + + running_range.data[1] = running_range.data[1] * height_repeat; + } + + // Batch + if (int64_t batch_repeat = dim_at(repeats); batch_repeat > 1) { + api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false); + + for (int i = 1; i < batch_repeat; ++i) { + api::utils::ivec3 dst_offset = + api::utils::make_ivec3({0, 0, i * running_range.data[2]}, false); + + add_copy_offset_node( + graph, out, running_range, src_offset, dst_offset, out); + } + + running_range.data[2] = running_range.data[2] * batch_repeat; + } +} + +void repeat(ComputeGraph& graph, const std::vector& args) { + add_repeat_node(graph, args[0], args[1], args[2]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.repeat.default, repeat); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp new file mode 100644 index 00000000000..1d85984ef18 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include +#include +#include + +namespace vkcompute { + +void check_args( + const vTensor& t_in, + int64_t dim, + int64_t index, + const vTensor& t_out) { + VK_CHECK_COND(check_memory_layout_is(t_in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(t_out, api::kChannelsPacked)); + + const int64_t in_dim = t_in.dim(); + VK_CHECK_COND( + in_dim == 3 || in_dim == 4, + "Vulkan select only support 3d or 4d tensors!"); + + const int64_t in_size = t_in.size(dim); + + if (index < -in_size || index >= in_size) { + VK_CHECK_COND( + false, + "select(): index ", + index, + " t_outof range for tensor of size ", + in_size, + " at dimension ", + dim); + } +} + +void add_select_int_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef dim_ref, + const ValueRef index_ref, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + int64_t dim = graph.extract_scalar(dim_ref); + int64_t index = graph.extract_scalar(index_ref); + + check_args(*t_in, dim, index, *t_out); + + const int64_t in_size = t_in->size(dim); + + if (index < 0) { + index += in_size; + } + + std::string kernel_name; + + // for 3d tensors, these values are not used by the shader. + int32_t num_texel_per_batch = 1; + int32_t num_batches = 1; + + int64_t in_dim = t_in->dim(); + if (in_dim == 3) { + if (dim == 0) { + kernel_name = "select_channel_3d"; + } else if (dim == 1) { + kernel_name = "select_height_3d"; + } else if (dim == 2) { + kernel_name = "select_width_3d"; + } else { + VK_CHECK_COND( + false, "Unexpected dim value=", dim, "for the input 3d tensor"); + } + } else { // self.dim() == 4 + num_texel_per_batch = + static_cast(std::ceil(static_cast(t_in->size(1)) / 4)); + num_batches = t_in->size(0); + if (dim == 0) { + kernel_name = "select_batch_4d"; + } else if (dim == 1) { + kernel_name = "select_channel_4d"; + } else if (dim == 2) { + kernel_name = "select_height_4d"; + } else if (dim == 3) { + kernel_name = "select_width_4d"; + } else { + VK_CHECK_COND( + false, "Unexpected dim value=", dim, "for the input 4d tensor"); + } + } + + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + // TODO: add resizing to support dynamic shapes. + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + // Parameter buffers + {t_out->texture_limits_ubo(), + t_out->sizes_ubo(), + // TODO: num_batches and num_texel_per_batch are provided by + // t_out->sizes. Can change the following to reduce params + // created. 
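// For instance (hypothetical sizes): a 4d input of sizes {2, 10, H, W} gives
// num_batches = 2 and num_texel_per_batch = ceil(10 / 4) = 3, so the ivec4
// below packs {index, 2, 3, 0} for the shader.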
+ graph.create_params_buffer(api::utils::make_ivec4( + {index, num_batches, num_texel_per_batch, 0}))}, + // Specialization Constants + {})); +} + +void select_int(ComputeGraph& graph, const std::vector& args) { + return add_select_int_node(graph, args[0], args[1], args[2], args[3]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.select.int, select_int); + VK_REGISTER_OP(aten.select_copy.int, select_int); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp new file mode 100644 index 00000000000..bceec27baee --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +void add_slice_tensor_out_node( + ComputeGraph& graph, + ValueRef in, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef step_ref, + ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); + + // Normalize the dim + int64_t dim = graph.extract_scalar(dim_ref); + + VK_CHECK_COND( + -t_in->dim() <= dim && dim < t_in->dim(), + "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ", + dim, + " and self.dim() = ", + t_in->dim()); + + dim = normalize(dim, t_in->dim()); + + // Compute the dim as if the underlying tensor were 4-dimensional.
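// For example (hypothetical rank): for a 3d input (C, H, W), dim == 1
// (height) maps to nchw_dim == 2, since the missing batch dimension is
// treated as an implicit leading dimension of size 1.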
+ int64_t nchw_dim = dim + (4 - t_in->dim()); + + std::optional opt_start = + graph.extract_optional_scalar(opt_start_ref); + std::optional opt_end = + graph.extract_optional_scalar(opt_end_ref); + int64_t step = graph.extract_scalar(step_ref); + + const auto in_sizes = t_in->sizes(); + const auto out_sizes = t_out->sizes(); + + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(in_sizes[dim]); + + VK_CHECK_COND((0 <= start) && (start < in_sizes[dim])); + VK_CHECK_COND((0 <= end) && (end <= in_sizes[dim])); + + if (nchw_dim == 1) { + // slice by channel + std::string kernel_name = "slice_channel"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + int offset; + int step; + } params{ + static_cast(start), + static_cast(step), + }; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + {{out, api::MemoryAccessType::WRITE}, + {in, api::MemoryAccessType::READ}}, + {t_out->sizes_ubo(), + t_in->sizes_ubo(), + graph.create_params_buffer(params)})); + + } else { + // GPU's coordinate is in x, y, z + int64_t gpu_dim = -1; + int64_t stride = 1; + if (nchw_dim == 3) { + gpu_dim = 0; // width: x dimension in gpu + VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); + } else if (nchw_dim == 2) { + gpu_dim = 1; // height: y dimension + VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); + } else if (nchw_dim == 0) { + gpu_dim = 2; // batch: z dimension + + // Due to channel packing, each batch value is span over stride planes + int64_t n_channels = dim_at(in_sizes); + stride = api::utils::div_up(n_channels, 4ll); + } else { + VK_THROW("Unexpected ncwh_dim!"); + } + + std::string kernel_name = "slice_batch_height_width"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + int dim; + int offset; + int step; + int stride; + } params{ + static_cast(gpu_dim), + static_cast(start), + static_cast(step), + static_cast(stride), + }; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + {{out, api::MemoryAccessType::WRITE}, + {in, api::MemoryAccessType::READ}}, + {t_out->sizes_ubo(), graph.create_params_buffer(params)})); + } +} + +void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { + return add_slice_tensor_out_node( + graph, + args[0], + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + args[5]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index a506564a018..2c92af606cf 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -19,12 +19,12 @@ void add_staging_to_tensor_node( ComputeGraph& graph, const ValueRef in_staging, const ValueRef out_tensor) { - vTensor& t_out = graph.get_val(out_tensor).toTensor(); - VK_CHECK_COND(graph.get_val(in_staging).isStaging()); + vTensorPtr t_out = graph.get_tensor(out_tensor); + 
VK_CHECK_COND(graph.val_is_staging(in_staging)); - api::ShaderInfo shader = get_nchw_to_image_shader(t_out); + api::ShaderInfo shader = get_nchw_to_image_shader(*t_out); - api::utils::uvec3 global_size = t_out.extents(); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -32,21 +32,28 @@ void add_staging_to_tensor_node( shader, global_size, local_size, + // Input and Outputs {{out_tensor, api::MemoryAccessType::WRITE}, {in_staging, api::MemoryAccessType::READ}}, - {t_out.gpu_sizes_ubo(), t_out.cpu_sizes_ubo()})); + // Parameter Buffers + {t_out->sizes_ubo()}, + // Specialization Constants + {SV(t_out->gpu_memory_layout_int())}, + // Resizing Logic + nullptr, + {})); } void add_tensor_to_staging_node( ComputeGraph& graph, const ValueRef in_tensor, const ValueRef out_staging) { - vTensor& t_in = graph.get_val(in_tensor).toTensor(); - VK_CHECK_COND(graph.get_val(out_staging).isStaging()); + vTensorPtr t_in = graph.get_tensor(in_tensor); + VK_CHECK_COND(graph.val_is_staging(out_staging)); - api::ShaderInfo shader = get_image_to_nchw_shader(t_in); + api::ShaderInfo shader = get_image_to_nchw_shader(*t_in); - api::utils::uvec3 global_size = t_in.extents(); + api::utils::uvec3 global_size = t_in->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -54,22 +61,25 @@ void add_tensor_to_staging_node( shader, global_size, local_size, + // Input and Outputs {{in_tensor, api::MemoryAccessType::READ}, {out_staging, api::MemoryAccessType::WRITE}}, - {t_in.gpu_sizes_ubo(), t_in.cpu_sizes_ubo()})); + // Parameter Buffers + {t_in->sizes_ubo()}, + // Specialization Constants + {SV(t_in->gpu_memory_layout_int())})); } ValueRef prepack( ComputeGraph& graph, const ValueRef vref, const api::GPUMemoryLayout layout) { - TensorRef& tref = graph.get_val(vref).toTensorRef(); - ValueRef v = graph.add_tensor(tref.sizes, tref.dtype, layout); - vTensor t = graph.get_val(v).toTensor(); + ValueRef v = graph.add_tensor_like(vref, layout); + vTensorPtr t = graph.get_tensor(v); - api::ShaderInfo shader = get_nchw_to_image_shader(t); + api::ShaderInfo shader = get_nchw_to_image_shader(*t); - api::utils::uvec3 global_size = t.extents(); + api::utils::uvec3 global_size = t->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.prepack_nodes().emplace_back(new PrepackNode( @@ -79,7 +89,9 @@ ValueRef prepack( local_size, vref, v, - {t.gpu_sizes_ubo(), t.cpu_sizes_ubo()})); + {t->sizes_ubo()}, + // Specialization Constants + {SV(t->gpu_memory_layout_int())})); return v; } @@ -88,7 +100,7 @@ ValueRef prepack_if_tensor_ref( ComputeGraph& graph, const ValueRef v, const api::GPUMemoryLayout layout) { - if (graph.get_val(v).isTensorRef()) { + if (graph.val_is_tref(v)) { return prepack(graph, v, layout); } else { return v; @@ -96,9 +108,9 @@ ValueRef prepack_if_tensor_ref( } ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { - if (graph.get_val(v).isTensorRef()) { + if (graph.val_is_tref(v)) { api::GPUMemoryLayout layout = - graph.suggested_memory_layout(graph.get_val(v).toTensorRef().sizes); + graph.suggested_memory_layout(graph.get_tref(v)->sizes); return prepack(graph, v, layout); } else { return v; diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp index c4ba07ae4a0..cf7f891cdcb 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp @@ -34,22 +34,20 @@ void resize_sum_node( const std::vector& args, const std::vector& extra_args) { (void)args; - vTensor& out = graph->get_val(extra_args[0]).toTensor(); - vTensor& in = graph->get_val(extra_args[1]).toTensor(); + vTensorPtr out = graph->get_tensor(extra_args[0]); + vTensorPtr in = graph->get_tensor(extra_args[1]); const auto dim = extra_args[2]; const auto keepdim = extra_args[3]; - std::vector output_size = calc_out_sizes(in, dim, keepdim); + std::vector output_size = calc_out_sizes(*in, dim, keepdim); - out.virtual_resize(output_size); + out->virtual_resize(output_size); } void check_sum_args(const vTensor& in, const vTensor& out) { - VK_CHECK_COND( - check_memory_layout_is(in, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED)); - VK_CHECK_COND(check_memory_layout_is( - out, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED)); + VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked)); } void add_sum_dim_node( @@ -60,40 +58,42 @@ void add_sum_dim_node( const ValueRef out) { ValueRef arg = prepack_if_tensor_ref(graph, in); - vTensor& t_out = graph.get_val(out).toTensor(); - vTensor& t_input = graph.get_val(in).toTensor(); + vTensorPtr t_out = graph.get_tensor(out); + vTensorPtr t_input = graph.get_tensor(in); - check_sum_args(t_input, t_out); + check_sum_args(*t_input, *t_out); - int64_t in_dim = t_input.sizes().size(); + int64_t in_dim = t_input->sizes().size(); int32_t channel = - in_dim > 2 ? static_cast(t_input.sizes()[in_dim - 3]) : 1; - uint32_t dim_size = t_input.sizes()[dim]; + in_dim > 2 ? static_cast(t_input->sizes()[in_dim - 3]) : 1; + uint32_t dim_size = t_input->sizes()[dim]; - api::utils::uvec3 global_size = t_out.virtual_extents(); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - std::stringstream kernel_name; - kernel_name << "sum_dim"; + std::string kernel_name("sum_dim"); + kernel_name.reserve(kShaderNameReserve); if (keepdim) { - kernel_name << "_keepdim"; + kernel_name += "_keepdim"; } - apply_dtype_suffix(kernel_name, t_out); + add_dtype_suffix(kernel_name, *t_out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs {{out, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}}, // Shader params buffers - {t_out.extents_ubo(), + {t_out->texture_limits_ubo(), graph.create_params_buffer(dim + 4 - in_dim), graph.create_params_buffer(dim_size), graph.create_params_buffer(int(ceil(channel / 4.0)))}, - // Resizing + // Specialization Constants + {}, + // Resizing Logic resize_sum_node, {out, in, static_cast(dim), keepdim})); } @@ -104,10 +104,9 @@ ValueRef add_node( const int dim, const bool keepdim, const api::ScalarType dtype = api::kFloat) { - vTensor& v_input = graph.get_val(input).toTensor(); - std::vector output_size = calc_out_sizes(v_input, dim, keepdim); - return graph.add_tensor( - output_size, dtype, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + std::vector output_size = + calc_out_sizes(*(graph.get_tensor(input)), dim, keepdim); + return graph.add_tensor(output_size, dtype, api::kChannelsPacked); } void add_sum_dim_IntList( @@ -116,17 +115,23 @@ void add_sum_dim_IntList( const ValueRef opt_dim, const ValueRef keepdim, const ValueRef out) { - bool keepdim_val = 
graph.get_val(keepdim).toBool(); - vTensor& in_tensor = graph.get_val(in).toTensor(); + bool keepdim_val = graph.get_bool(keepdim); std::set dims_set; - auto dims_to_sum = graph.get_val(opt_dim).toIntList(); - int64_t in_dim = in_tensor.sizes().size(); - - for (const auto& dim : dims_to_sum) { - // Normalize (negative) dim into range [0, self.dim() - 1] - int64_t dim_normalized = normalize(dim, in_dim); - dims_set.insert(dim_normalized); + const auto dims_to_sum = *graph.get_int_list(opt_dim); + int64_t in_dim = graph.get_tensor(in)->sizes().size(); + + if (dims_to_sum.empty()) { + // If dim is not specified, reduce over all dims + for (int64_t i = 0; i < in_dim; ++i) { + dims_set.insert(i); + } + } else { + for (const auto& dim : dims_to_sum) { + // Normalize (negative) dim into range [0, self.dim() - 1] + int64_t dim_normalized = normalize(dim, in_dim); + dims_set.insert(dim_normalized); + } } // Reduce the higher dimensionalities first, otherwise when keepdim is diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 08b24801bc4..3888118b90d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -25,10 +25,10 @@ void resize_unary_op_node( const std::vector& args, const std::vector& extra_args) { (void)extra_args; - vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); - vTensor& self = graph->get_val(args[1].refs[0]).toTensor(); + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr self = graph->get_tensor(args[1].refs[0]); - out.virtual_resize(self.sizes()); + out->virtual_resize(self->sizes()); } void add_unary_op_node( @@ -40,32 +40,33 @@ void add_unary_op_node( const std::string& op_name) { ValueRef arg = prepack_if_tensor_ref(graph, in); - vTensor& t_out = graph.get_val(out).toTensor(); - api::utils::uvec3 global_size = t_out.virtual_extents(); + vTensorPtr t_out = graph.get_tensor(out); + api::utils::uvec3 global_size = t_out->extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - std::stringstream kernel_name; - kernel_name << op_name; - apply_dtype_suffix(kernel_name, t_out); + std::string kernel_name(op_name); + add_dtype_suffix(kernel_name, *t_out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs {{out, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}}, // Shader params buffers - {t_out.extents_ubo(), + {t_out->texture_limits_ubo(), graph.create_params_buffer(min), graph.create_params_buffer(max)}, - // Resizing + // Specialization Constants + {}, + // Resizing Logic resize_unary_op_node)); } float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { - if (!graph.get_val(val).isNone()) { - return extract_scalar(graph.get_val(val)); + if (!graph.val_is_none(val)) { + return graph.extract_scalar(val); } return max ? 
std::numeric_limits::infinity() : -std::numeric_limits::infinity(); @@ -82,8 +83,8 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { return add_unary_op_node( \ graph, \ args[0], \ - get_val_or_inf(graph, args[1], /*max =*/false), \ - get_val_or_inf(graph, args[2], /*max =*/true), \ + get_val_or_inf(graph, args[1], /*max = */ false), \ + get_val_or_inf(graph, args[2], /*max = */ true), \ args[3], \ kClampShaderName); \ } diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp new file mode 100644 index 00000000000..c8ada796e8e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +namespace vkcompute { + +void add_unsqueeze_node( + ComputeGraph& graph, + ValueRef in, + ValueRef dim_ref, + ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + VK_CHECK_COND( + t_in->dim() < 4, "Cannot unsqueeze a tensor with more than 3 dimensions"); + + int64_t dim = graph.extract_scalar(dim_ref); + int64_t out_dim = t_out->dim(); + + std::vector permute_dims(out_dim); + for (int i = 1; i <= dim; i++) { + permute_dims[i - 1] = i; + } + permute_dims[dim] = 0; + + for (int i = dim + 1; i < out_dim; i++) { + permute_dims[i] = i; + } + + add_permute_node(graph, in, permute_dims, out); +} + +void unsqueeze(ComputeGraph& graph, const std::vector& args) { + return add_unsqueeze_node(graph, args[0], args[1], args[2]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp new file mode 100644 index 00000000000..e492e54832b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +namespace vkcompute { + +void add_view_node(ComputeGraph& graph, ValueRef in, ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + std::string kernel_name = "view"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + api::utils::uvec3 global_size = t_out->extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + // Parameter Buffers + {t_out->sizes_ubo(), t_in->sizes_ubo()}, + // Specialization Constants + {SV(t_in->gpu_memory_layout_int())})); +} + +void view(ComputeGraph& graph, const std::vector& args) { + // Note: The second argument size_ref is not used here. Since the output + // tensor's size have been determined during compilation. 
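+  // args[0] is the input tensor, args[1] is the unused size argument, and
+  // args[2] is the output tensor.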
+ return add_view_node(graph, args[0], args[2]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.view_copy.default, view); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp index 86371d3c2d8..6b823fe30cd 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp @@ -10,25 +10,152 @@ namespace vkcompute { +api::utils::ivec2 make_ivec2_from_list(ComputeGraph& graph, ValueRef vref) { + return api::utils::make_ivec2(*graph.get_int_list(vref), /*reverse = */ true); +} + +api::utils::ivec2 make_ivec2_kernel_size( + ComputeGraph& graph, + const ValueRef weight, + const bool kernel_size_only) { + if (kernel_size_only) { + return make_ivec2_from_list(graph, weight); + } else { + const auto weight_sizes = graph.get_tref(weight)->sizes; + return api::utils::make_ivec2({weight_sizes.at(3), weight_sizes.at(2)}); + } +} + +Kernel2dParams create_kernel2d_params( + ComputeGraph& graph, + const ValueRef weight, + const bool kernel_size_only, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation) { + return { + make_ivec2_kernel_size(graph, weight, kernel_size_only), + make_ivec2_from_list(graph, stride), + make_ivec2_from_list(graph, padding), + make_ivec2_from_list(graph, dilation), + }; +} + int64_t calc_out_size( const int64_t in_size, - const int64_t kernel, + const int64_t kernel_size, const int64_t stride, const int64_t padding, const int64_t dilation, const bool ceil_mode) { int64_t c = ceil_mode ? stride - 1 : 0; int64_t out_size = - (in_size + 2 * padding - dilation * (kernel - 1) - 1 + c) / stride + 1; + (in_size + 2 * padding - dilation * (kernel_size - 1) - 1 + c) / stride + + 1; if (ceil_mode && (out_size - 1) * stride >= in_size + padding) { --out_size; } + VK_CHECK_COND(out_size >= 1); + return out_size; +} + +std::vector calc_out_sizes_hw( + const std::vector& in_sizes, + const api::utils::ivec2& kernel_size, + const api::utils::ivec2& stride, + const api::utils::ivec2& padding, + const api::utils::ivec2& dilation, + const bool ceil_mode) { + const int64_t ndim = in_sizes.size(); + std::vector out_sizes(2); + + // Height + out_sizes.at(0) = calc_out_size( + in_sizes.at(ndim - 2), + kernel_size.data[1], + stride.data[1], + padding.data[1], + dilation.data[1], + ceil_mode); + // Width + out_sizes.at(1) = calc_out_size( + in_sizes.at(ndim - 1), + kernel_size.data[0], + stride.data[0], + padding.data[0], + dilation.data[0], + ceil_mode); + + return out_sizes; +} + +int64_t calc_transpose_out_size( + const int64_t in_size, + const int64_t kernel, + const int64_t stride, + const int64_t padding, + const int64_t dilation, + const int64_t output_padding) { + int64_t out_size = (in_size - 1) * stride - 2 * padding + + dilation * (kernel - 1) + output_padding + 1; + VK_CHECK_COND(out_size >= 1); return out_size; } -api::utils::ivec2 reverse(ComputeGraph& graph, ValueRef vref) { - return api::utils::make_ivec2( - graph.get_val(vref).toIntList(), /*reverse=*/true); +std::vector calc_transpose_out_sizes_hw( + const std::vector& in_sizes, + const api::utils::ivec2& kernel_size, + const api::utils::ivec2& stride, + const api::utils::ivec2& padding, + const api::utils::ivec2& dilation, + const api::utils::ivec2& output_padding) { + const int64_t ndim = in_sizes.size(); + std::vector out_sizes(2); + + // Height + out_sizes.at(0) = calc_transpose_out_size( + in_sizes.at(ndim - 2), + 
kernel_size.data[1], + stride.data[1], + padding.data[1], + dilation.data[1], + output_padding.data[1]); + // Width + out_sizes.at(1) = calc_transpose_out_size( + in_sizes.at(ndim - 1), + kernel_size.data[0], + stride.data[0], + padding.data[0], + dilation.data[0], + output_padding.data[0]); + + return out_sizes; +} + +std::vector calc_out_sizes_hw( + ComputeGraph& graph, + const std::vector& in_sizes, + const ValueRef weight, + const bool kernel_size_only, + const std::vector& args, + const bool transposed) { + const auto kernel_size = + make_ivec2_kernel_size(graph, weight, kernel_size_only); + const auto stride = make_ivec2_from_list(graph, args[0]); + const auto padding = make_ivec2_from_list(graph, args[1]); + const auto dilation = make_ivec2_from_list(graph, args[2]); + + if (transposed) { + const auto output_padding = make_ivec2_from_list(graph, args[3]); + return calc_transpose_out_sizes_hw( + in_sizes, kernel_size, stride, padding, dilation, output_padding); + } else { + const bool ceil_mode = + graph.val_is_bool(args[3]) ? graph.get_bool(args[3]) : false; + + return calc_out_sizes_hw( + in_sizes, kernel_size, stride, padding, dilation, ceil_mode); + } } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h index 6e6763dc574..eb0215bfd59 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h @@ -16,13 +16,30 @@ namespace vkcompute { -struct KernelParams final { +struct Kernel2dParams final { api::utils::ivec2 kernel_size; api::utils::ivec2 stride; api::utils::ivec2 padding; api::utils::ivec2 dilation; }; +struct Kernel1dParams final { + int kernel_size; + int stride; + int padding; + int dilation; + int in_group_size; + int out_group_size; +}; + +Kernel2dParams create_kernel2d_params( + ComputeGraph& graph, + const ValueRef weight, + const bool kernel_size_only, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation); + int64_t calc_out_size( const int64_t in_size, const int64_t kernel_size, @@ -31,6 +48,12 @@ int64_t calc_out_size( const int64_t dilation, const bool ceil_mode); -api::utils::ivec2 reverse(ComputeGraph& graph, ValueRef vref); +std::vector calc_out_sizes_hw( + ComputeGraph& graph, + const std::vector& in_sizes, + const ValueRef weight, + const bool kernel_size_only, + const std::vector& args, + const bool transposed = false); } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 307d676d3b2..842cfa2f4fc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -78,13 +78,13 @@ bool is_packed_dim_broadcasted(const vTensor& sndr, const vTensor& rcvr) { // We assume that the tensors are broadcastable. If values aren't equal at // some index, then the value of rcvr is 1 and hence should be broadcasted. 
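  // The packed memory layout decides which size entry to compare: channels
  // (-3), height (-2), or width (-1), indexed from the end of the sizes vector.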
switch (sndr.gpu_memory_layout()) { - case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: + case api::kChannelsPacked: return api::utils::val_at(-3, sndr.sizes()) > api::utils::val_at(-3, rcvr.sizes()); - case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED: + case api::kHeightPacked: return api::utils::val_at(-2, sndr.sizes()) > api::utils::val_at(-2, rcvr.sizes()); - case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: + case api::kWidthPacked: return api::utils::val_at(-1, sndr.sizes()) > api::utils::val_at(-1, rcvr.sizes()); } diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index fc80b987604..158fad3cbba 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -36,18 +36,18 @@ uint32_t bind_values_to_descriptor_set( uint32_t idx = base_idx; for (auto& arg : args) { for (auto& ref : arg.refs) { - Value& val = graph->get_val(ref); - if (val.isTensor()) { + if (graph->val_is_tensor(ref)) { bind_tensor_to_descriptor_set( - val.toTensor(), + *(graph->get_tensor(ref)), pipeline_barrier, arg.access, descriptor_set, idx++); - } else if (val.isStaging()) { - bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++); + } else if (graph->val_is_staging(ref)) { + bind_staging_to_descriptor_set( + *(graph->get_staging(ref)), descriptor_set, idx++); } else { - VK_THROW("Unsupported type: ", val.type()); + VK_THROW("Unsupported type: ", graph->get_val_type(ref)); } } } @@ -55,12 +55,12 @@ uint32_t bind_values_to_descriptor_set( } uint32_t bind_params_to_descriptor_set( - std::vector>& params, + const api::ParamsBindList& params, api::DescriptorSet& descriptor_set, const uint32_t base_idx) { uint32_t idx = base_idx; - for (auto& param : params) { - descriptor_set.bind(idx++, param->buffer()); + for (auto& param : params.bind_infos) { + descriptor_set.bind(idx++, param); } return idx; } diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 298ed8d76fd..8b3e579c746 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -35,7 +35,7 @@ uint32_t bind_values_to_descriptor_set( // uint32_t bind_params_to_descriptor_set( - std::vector>& params, + const api::ParamsBindList& params, api::DescriptorSet& descriptor_set, const uint32_t base_idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp index 9bdb3c4be58..0bca0b4f055 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -10,34 +10,45 @@ namespace vkcompute { -void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor) { +void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor) { switch (tensor.image().format()) { case VK_FORMAT_R32G32B32A32_SFLOAT: - kernel_name << "_float"; + kernel_name += "_float"; break; case VK_FORMAT_R16G16B16A16_SFLOAT: - kernel_name << "_half"; + kernel_name += "_half"; break; case VK_FORMAT_R32G32B32A32_SINT: - kernel_name << "_int"; + kernel_name += "_int"; break; default: break; } } -void apply_memory_layout_suffix( - std::stringstream& kernel_name, - const vTensor& tensor) { +void add_ndim_suffix(std::string& kernel_name, const vTensor& tensor) { + switch (tensor.storage_type()) { + case api::kTexture3D: + 
kernel_name += "_3d"; + break; + case api::kTexture2D: + kernel_name += "_2d"; + break; + default: + break; + } +} + +void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor) { switch (tensor.gpu_memory_layout()) { - case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: - kernel_name << "_C_packed"; + case api::kChannelsPacked: + kernel_name += "_C_packed"; break; - case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED: - kernel_name << "_H_packed"; + case api::kHeightPacked: + kernel_name += "_H_packed"; break; - case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: - kernel_name << "_W_packed"; + case api::kWidthPacked: + kernel_name += "_W_packed"; break; default: break; diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h index 3f094432bb1..a784a4acb4c 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h @@ -10,14 +10,16 @@ #include -#include +#include namespace vkcompute { -void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor); +constexpr size_t kShaderNameReserve = 64u; -void apply_memory_layout_suffix( - std::stringstream& kernel_name, - const vTensor& tensor); +void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor); + +void add_ndim_suffix(std::string& kernel_name, const vTensor& tensor); + +void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor); } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 7f5ae409d44..945fda0768d 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -89,52 +89,48 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { - if (v_dst.is_quantized()) { - VK_THROW("Quantized Tensors are currently not supported!"); - } +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { + api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE); + uint8_t* data_ptr = mapping.template data(); + memset(data_ptr, 0, staging.nbytes()); +} - std::stringstream kernel_name; +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { + std::string kernel_name; + kernel_name.reserve(kShaderNameReserve); switch (v_dst.storage_type()) { - case api::StorageType::TEXTURE_3D: - kernel_name << "nchw_to_image3d"; - break; - case api::StorageType::TEXTURE_2D: - kernel_name << "nchw_to_image2d"; + case api::kTexture3D: + case api::kTexture2D: + kernel_name = "nchw_to_image"; break; default: VK_THROW("No kernel available!"); } - apply_memory_layout_suffix(kernel_name, v_dst); - apply_dtype_suffix(kernel_name, v_dst); + add_ndim_suffix(kernel_name, v_dst); + add_dtype_suffix(kernel_name, v_dst); - return VK_KERNEL_FROM_STR(kernel_name.str()); + return VK_KERNEL_FROM_STR(kernel_name); } api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) { - if (v_src.is_quantized()) { - VK_THROW("Quantized Tensors are currently not supported!"); - } - - std::stringstream kernel_name; + std::string kernel_name; + kernel_name.reserve(kShaderNameReserve); switch (v_src.storage_type()) { - case api::StorageType::TEXTURE_3D: - kernel_name << "image3d_to_nchw"; - break; - case api::StorageType::TEXTURE_2D: - kernel_name << "image2d_to_nchw"; + case 
api::kTexture3D: + case api::kTexture2D: + kernel_name = "image_to_nchw"; break; default: VK_THROW("No kernel available!"); } - apply_memory_layout_suffix(kernel_name, v_src); - apply_dtype_suffix(kernel_name, v_src); + add_ndim_suffix(kernel_name, v_src); + add_dtype_suffix(kernel_name, v_src); - return VK_KERNEL_FROM_STR(kernel_name.str()); + return VK_KERNEL_FROM_STR(kernel_name); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index 0634d8d02e7..0bcbff5d74e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -25,6 +25,8 @@ void copy_staging_to_ptr( void* dst, const size_t nbytes); +void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); + // // Functions to get shaders // diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 3f7e473c27c..9c12cb4a010 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -178,7 +178,11 @@ def create_tensor_value(self, spec: TensorSpec, constant_id: int = -1) -> int: def create_scalar_list_value(self, arg: List[_ScalarType]) -> int: new_id = len(self.values) - if isinstance(arg[0], bool): + if len(arg) == 0: + self.values.append( + vk_graph_schema.VkValue(vk_graph_schema.IntList(items=[])) + ) + elif isinstance(arg[0], bool): self.values.append( vk_graph_schema.VkValue( vk_graph_schema.BoolList(items=[cast(bool, e) for e in arg]) @@ -221,13 +225,20 @@ def get_or_create_value_for(self, arg: _Argument): if arg in self.node_to_value_ids: return self.node_to_value_ids[arg] return self.create_node_value(arg) - elif isinstance(arg, NoneType): + elif ( + isinstance(arg, NoneType) + or isinstance(arg, torch.device) + or isinstance(arg, torch.dtype) + or isinstance(arg, torch.layout) + ): return self.create_null_value() elif isinstance(arg, _ScalarType): return self.create_scalar_value(arg) elif isinstance(arg, TensorSpec): return self.create_tensor_value(arg) - elif isinstance(arg, list) and isinstance(arg[0], _ScalarType): + elif isinstance(arg, list) and ( + len(arg) == 0 or isinstance(arg[0], _ScalarType) + ): # pyre-ignore[6] return self.create_scalar_list_value(arg) elif isinstance(arg, list) and isinstance(arg[0], Node): diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml index 09f051e04b5..46a1cd2bf5f 100644 --- a/backends/vulkan/test/glsl/all_shaders.yaml +++ b/backends/vulkan/test/glsl/all_shaders.yaml @@ -33,6 +33,17 @@ fill_texture__test: shader_variants: - NAME: fill_texture__test +idx_fill_buffer: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + - VALUE: int8 + shader_variants: + - NAME: idx_fill_buffer + idx_fill_texture: parameter_names_with_default_values: DTYPE: float diff --git a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl index f5e5d6b4e4d..7f72ac58972 100644 --- a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl +++ b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl @@ -7,11 +7,10 @@ */ #version 450 core -// clang-format off + #define PRECISION ${PRECISION} -#define OP(X, Y) ${OPERATOR} -// clang-format on +#define op(X, Y) ${OPERATOR} layout(std430) buffer; @@ -38,5 +37,5 @@ void 
main() { vec4 in_texel = texelFetch(image_in, pos, 0); vec4 other_texel = texelFetch(image_other, pos, 0); - imageStore(image_out, pos, OP(in_texel, other_texel)); + imageStore(image_out, pos, op(in_texel, other_texel)); } diff --git a/backends/vulkan/test/glsl/fill_buffer.glsl b/backends/vulkan/test/glsl/fill_buffer.glsl new file mode 100644 index 00000000000..090d9e70d6c --- /dev/null +++ b/backends/vulkan/test/glsl/fill_buffer.glsl @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +$PRECISION = "highp" +$DTYPE = "float" + +#define PRECISION ${PRECISION} + +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { + VEC4_T data[]; +} +buffer_in; + +layout(set = 0, binding = 1) uniform PRECISION restrict Params { + int len; +} +params; + + + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const float scale = 1; +layout(constant_id = 4) const float offset = 0; + +void main() { + const int i = ivec3(gl_GlobalInvocationID).x; + + const int base = 4 * i; + if (base < params.len) { + buffer_in.data[i] = scale * (VEC4_T(base) + VEC4_T(0, 1, 2, 3)) + offset; + } +} diff --git a/backends/vulkan/test/glsl/fill_texture__test.glsl b/backends/vulkan/test/glsl/fill_texture__test.glsl index fafad11d498..76c630de55e 100644 --- a/backends/vulkan/test/glsl/fill_texture__test.glsl +++ b/backends/vulkan/test/glsl/fill_texture__test.glsl @@ -7,15 +7,12 @@ */ #version 450 core + #define PRECISION ${PRECISION} layout(std430) buffer; -/* Qualifiers: layout - storage - precision - memory */ - -// clang-format off layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} uOutput; -// clang-format on layout(set = 0, binding = 1) uniform PRECISION restrict Block { ivec3 size; int fill; diff --git a/backends/vulkan/test/glsl/idx_fill_buffer.glsl b/backends/vulkan/test/glsl/idx_fill_buffer.glsl new file mode 100644 index 00000000000..98cf04e338d --- /dev/null +++ b/backends/vulkan/test/glsl/idx_fill_buffer.glsl @@ -0,0 +1,48 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#include "indexing_utils.h" + +$if DTYPE == "half": + #extension GL_EXT_shader_16bit_storage : require + #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +$elif DTYPE == "int8": + #extension GL_EXT_shader_8bit_storage : require + #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +$elif DTYPE == "uint8": + #extension GL_EXT_shader_8bit_storage : require + #extension GL_EXT_shader_explicit_arithmetic_types_uint8 : require + +layout(std430) buffer; + +layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { + VEC4_T data[]; +} +buffer_in; + +layout(set = 0, binding = 1) uniform PRECISION restrict Params { + int len; +} +params; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int i = ivec3(gl_GlobalInvocationID).x; + + const int base = 4 * i; + if (base < params.len) { + buffer_in.data[i] = VEC4_T(base, base + 1, base + 2, base + 3); + } +} diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl index a6500bd3ede..fced95bca5d 100644 --- a/backends/vulkan/test/glsl/idx_fill_texture.glsl +++ b/backends/vulkan/test/glsl/idx_fill_texture.glsl @@ -10,37 +10,31 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION restrict GpuSizes { - ivec4 data; -} -gpu_sizes; - -layout(set = 0, binding = 2) uniform PRECISION restrict CpuSizes { - ivec4 data; -} -cpu_sizes; +layout(set = 0, binding = 1) uniform PRECISION restrict Sizes { + ivec4 sizes; +}; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +layout(constant_id = 3) const int packed_dim = C_DIM; + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, sizes))) { return; } - const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); - const ivec4 buf_indices = - base_index + ivec4(0, 1, 2, 3) * PLANE_SIZE_${PACKING}(gpu_sizes.data); - - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(buf_indices); - - imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); + const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + VEC4_T texel = VEC4_T(buf_indices); + imageStore(image_out, ${get_pos[NDIM]("pos")}, texel); } diff --git a/backends/vulkan/test/glsl/indexing_utils.h b/backends/vulkan/test/glsl/indexing_utils.h index a881b49801b..8563daaa5fb 100644 --- a/backends/vulkan/test/glsl/indexing_utils.h +++ b/backends/vulkan/test/glsl/indexing_utils.h @@ -6,27 +6,98 @@ * LICENSE file in the root directory of this source tree. */ -#define PACKED_DIM_CHANNELS_PACKED(vec) vec.z +// Width Dim Index, assuming (W, H, C, N) order +#define W_DIM 0 +// Height, assuming (W, H, C, N) order +#define H_DIM 1 +// Channels, assuming (W, H, C, N) order +#define C_DIM 2 -#define PACKED_DIM_WIDTH_PACKED(vec) vec.x +/* + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. 
+ * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. + */ +#define BATCH_AXIS 2 + +// +// Basic Indexing Utility Macros and Functions +// -#define PACKED_DIM_HEIGHT_PACKED(vec) vec.y +/* + * Aligns input to the next multiple of 4 + */ +#define alignup4(x) ((x + 3) & -4) -#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \ - ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z) +// +// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion +// -#define POS_TO_COORD_WIDTH_PACKED(pos, sizes) \ - ivec4((pos.x * 4), pos.y, pos.z % sizes.z, pos.z / sizes.z) +/* + * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim + * is packed along a texel + * Output: A ivec4 containing the buffer indices corresponding to each texel + * element. + */ +ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { + ivec4 strides = + ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); -#define POS_TO_COORD_HEIGHT_PACKED(pos, sizes) \ - ivec4(pos.x, (pos.y * 4), pos.z % sizes.z, pos.z / sizes.z) + int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + + idx.w * strides.w; -#define COORD_TO_BUFFER_IDX(coord, sizes) \ - coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \ - coord.w* sizes.z* sizes.y* sizes.x; + return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; +} -#define PLANE_SIZE_CHANNELS_PACKED(vec) (vec.x * vec.y) +// +// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion +// + +/* + * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim + * is packed along a texel + * Output: Whether the texel position is outside the bounds of the image texture + * given the size and packed dimension of the tensor. 
+ */ +bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 max_pos = sizes.xyz; + max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; + max_pos[packed_dim] /= 4; + return (any(greaterThanEqual(pos, max_pos))); +} + +/* + * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, + * which dim is packed along a texel + * Returns: the (w, h, c, n) tensor index cooresponding to the first element of + * the texel at the specified position + */ +ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); -#define PLANE_SIZE_WIDTH_PACKED(vec) (1) + // Packed dim contains 4 elements per texel + pos[packed_dim] *= 4; + // Construct the initial tensor index via swizzling +#if BATCH_AXIS == 2 + ivec4 tensor_idx = pos.xyzz; +#endif +#if BATCH_AXIS == 1 + ivec4 tensor_idx = pos.xyzy; +#endif +#if BATCH_AXIS == 0 + ivec4 tensor_idx = pos.xyzx; +#endif + // Adjust the axis that the batch dim runs along + tensor_idx[3] /= sizes[BATCH_AXIS]; + tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; -#define PLANE_SIZE_HEIGHT_PACKED(vec) (vec.x) + return tensor_idx; +} diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl index 39edc92cc62..4804528346d 100644 --- a/backends/vulkan/test/glsl/test_shader.glsl +++ b/backends/vulkan/test/glsl/test_shader.glsl @@ -7,16 +7,14 @@ */ #version 450 core + #define PRECISION ${PRECISION} -#define FORMAT ${FORMAT} layout(std430) buffer; -/* Qualifiers: layout - storage - precision - memory */ - -layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { +layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { ivec4 size; } uBlock; diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 91b36a368b8..2a100b92e38 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. 
+from collections import namedtuple + from executorch.backends.vulkan.test.op_tests.utils.codegen import VkTestSuite @@ -21,7 +23,7 @@ def get_binary_elementwise_inputs(): - return VkTestSuite( + test_suite = VkTestSuite( [ ((M1, M2), (M1, M2)), ((M1, M2), (M1, 1), 2.0), @@ -31,6 +33,11 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, 1, S2), 2.0), ] ) + test_suite.layouts = [ + "api::kWidthPacked", + "api::kChannelsPacked", + ] + return test_suite def get_mm_inputs(): @@ -41,6 +48,12 @@ def get_mm_inputs(): ], ) test_suite.prepacked_args = ["mat2"] + # ATen matmul doesn't support half + test_suite.dtypes = ["at::kFloat"] + test_suite.layouts = [ + "api::kWidthPacked", + "api::kChannelsPacked", + ] return test_suite @@ -50,7 +63,368 @@ def get_pool2d_inputs(): ((S, M1, M2), [2, 2], [1, 1], [0, 0], [1, 1]), ] ) - test_suite.supports["layouts"] = ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"] + return test_suite + + +def get_conv_inputs(): + test_suite = VkTestSuite( + [ + ( + (1, 6, 40, 50), + (8, 6, 3, 3), + (8,), + [1, 2], + [2, 3], + [1, 1], + False, + [0, 0], + 1, + ), + ( + (1, 6, 40, 50), + (6, 8, 3, 3), + (8,), + [1, 2], + [2, 3], + [1, 1], + True, + [0, 1], + 1, + ), + ( + (1, 8, 72, 96), + (8, 1, 3, 3), + (8,), + [1, 1], + [1, 1], + [1, 1], + False, + [0, 0], + 8, + ), + ( + (1, 8, 72, 96), + (8, 8, 1, 1), + (8,), + [1, 1], + [1, 1], + [1, 1], + False, + [0, 0], + 1, + ), + ( + (1, 6, 40, 50), + (8, 6, 3, 3), + None, + [1, 2], + [2, 3], + [1, 1], + False, + [0, 0], + 1, + ), + ( + (1, 6, 7), + (6, 1, 3), + (6,), + [1], + [0], + [1], + False, + [0], + 6, + ), + ( + (2, 20, 30), + (10, 4, 6), + (10,), + [5], + [5], + [3], + False, + [0], + 5, + ), + ( + (1, 9, 11), + (9, 1, 3), + None, + [1], + [0], + [1], + False, + [0], + 9, + ), + ( + (5, 15, 30), + (20, 3, 3), + None, + [3], + [5], + [7], + False, + [0], + 5, + ), + ] + ) + return test_suite + + +def get_native_layer_norm_inputs(): + test_suite = VkTestSuite( + [ + ((S1, S2), [S2], (S2), (S2), 0.001), + ((M, M1, M2), [M2], (M2), (M2), 0.001), + ((S, XL, M1, M2), [M2], (M2), (M2), 0.001), + ] + ) + return test_suite + + +def get_full_inputs(): + test_suite = VkTestSuite( + [ + ([S1, S2], 42.0), + ([M, M1, M2], 3.14), + ([L, M, M1, M2], 2.72), + ] + ) + return test_suite + + +def get_select_int_inputs(): + test_suite = VkTestSuite( + [ + ((6, 2, 7), 0, 3), + ((6, 2, 7), 1, 0), + ((6, 2, 7), 2, 3), + ((6, 10, 7), 0, 3), + ((6, 10, 7), 1, 0), + ((6, 10, 7), 1, 9), + ((6, 10, 7), 2, 6), + ((9, 2, 9, 4), 0, 8), + ((9, 2, 9, 4), 1, 1), + ((9, 2, 9, 4), 2, 0), + ((9, 2, 9, 4), 2, 8), + ((9, 2, 9, 4), 3, 3), + ((8, 6, 1, 1), 0, 4), + ((8, 6, 1, 1), 1, 4), + ] + ) + return test_suite + + +def get_permute_inputs(): + test_suite = VkTestSuite( + [ + ((9, 2, 9, 4), [0, 1, 2, 3]), + ((9, 2, 9, 4), [0, 1, 3, 2]), + ((9, 2, 9, 4), [0, 2, 1, 3]), + ((9, 2, 9, 4), [0, 2, 3, 1]), + ((9, 2, 9, 4), [0, 3, 1, 2]), + ((9, 2, 9, 4), [0, 3, 2, 1]), + ((9, 2, 9, 4), [3, 0, 1, 2]), + ((9, 2, 9, 4), [3, 2, 0, 1]), + ((9, 2, 9, 4), [2, 3, 0, 1]), + ((9, 2, 9, 4), [2, 0, 3, 1]), + ((9, 2, 9), [2, 0, 1]), + ((9, 2, 9), [1, 2, 0]), + ((9, 2), [0, 1]), + ((9, 2), [1, 0]), + ] + ) + + test_suite.layouts = ["api::kChannelsPacked"] + return test_suite + + +def get_view_inputs(): + test_suite = VkTestSuite( + [ + ((3, 4, 5), [1, 1, -1]), + ((3, 4, 5), [1, -1, 1]), + ((3, 4, 5), [-1, 1, 1]), + ((8, 7, 2, 3), [4, 3, 7, 4]), + ((8, 7, 2, 3), [7, -1, 2, 1]), + ((8, 7, 2, 3), [1, 1, 1, -1]), + ((8, 7, 2, 3), [-1]), + ((2, 3, 3, 7), [2, -1, 1, 1]), + ((3, 5, 2, 
7), [7, -1, 2, 1]), + ((2, 2, 8, 6), [2, 6, -1, 1]), + ((2, 2, 8, 6), [6, -1, 1]), + ((S1, S2, S1, S2), [S2, -1, 1, S1]), + ((S1, S2, S1, S2), [S1, 1, -1, S2]), + ((S1, S2, S1, S2), [-1, 1, S1, S2]), + ] + ) + test_suite.layouts = [ + "api::kWidthPacked", + "api::kHeightPacked", + "api::kChannelsPacked", + ] + return test_suite + + +def get_slice_inputs(): + Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) + Test.__new__.__defaults__ = (None, 0, None, None, 1) + + # Slice by width and height + test_cases = [ + Test(self=[1, 1, 4, 10], dim=3, start=3), + Test(self=[1, 1, 4, 10], dim=3, start=3, step=2), + Test(self=[1, 1, 4, 10], dim=3, start=3, end=4, step=2), + Test(self=[1, 1, 4, 10], dim=2, start=3), + Test(self=[9, 9, 9, 9], dim=2, start=0, end=9, step=1), + Test(self=[9, 9, 9, 9], dim=2, start=1, end=8, step=1), + Test(self=[9, 9, 9, 9], dim=2, start=1, end=2, step=1), + Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=1), + Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=2), + Test(self=[9, 9, 9, 9], dim=-1, start=1, end=5, step=2), + Test(self=[9, 9, 9, 9], dim=-2, start=1, end=5, step=2), + Test(self=[9, 9, 9], dim=1, start=2, step=1), + Test(self=[9, 9, 9], dim=1, start=2, step=2), + Test(self=[9, 9, 9], dim=2, start=2, step=1), + Test(self=[9, 9, 9], dim=2, start=2, step=2), + Test(self=[9, 9], dim=0, start=2, step=1), + Test(self=[9, 9], dim=0, start=2, step=2), + Test(self=[9, 9], dim=1, start=2, step=1), + Test(self=[9, 9], dim=1, start=2, step=2), + ] + + # Slice by batch + test_cases += [ + Test(self=[6, 5, 3, 2], dim=0), + Test(self=[6, 5, 3, 2], dim=0, step=2), + Test(self=[13, 13, 3, 2], dim=0, step=2), + Test(self=[13, 13, 3, 2], dim=0, start=1, step=2), + Test(self=[13, 13, 3, 2], dim=0, start=1, step=5), + Test(self=[13, 13, 3, 2], dim=0, start=1, step=20), + Test(self=[13, 2, 3, 2], dim=0, start=1, step=2), + Test(self=[13, 2, 3, 2], dim=0, start=1, step=5), + Test(self=[13, 2, 3, 2], dim=0, start=1, step=20), + ] + + # Slice by channel + test_cases += [ + Test(self=[2, 5, 1, 10], dim=1), + Test(self=[2, 5, 1, 10], dim=1, start=1), + Test(self=[2, 5, 1, 10], dim=1, start=1, step=2), + Test(self=[5, 13, 1, 10], dim=1), + Test(self=[5, 13, 1, 10], dim=1, start=1), + Test(self=[5, 13, 1, 10], dim=1, start=1, step=2), + Test(self=[5, 13, 1, 10], dim=1, start=1, step=5), + Test(self=[5, 13, 1, 10], dim=1, start=1, step=20), + Test(self=[13, 1, 10], dim=0), + Test(self=[13, 1, 10], dim=0, start=1), + Test(self=[13, 1, 10], dim=0, start=1, step=2), + Test(self=[13, 1, 10], dim=0, start=1, step=5), + Test(self=[13, 1, 10], dim=0, start=1, step=20), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.layouts = [ + "api::kChannelsPacked", + ] + test_suite.data_gen = "make_seq_tensor" + return test_suite + + +def get_unsqueeze_inputs(): + test_suite = VkTestSuite( + [ + ((2, 3, 4), 0), + ((1, 1, 1), 0), + ((1, 1, 1), 1), + ((1, 1, 1), 2), + ((1, 1, 1), 3), + ((9, 9, 9), 0), + ((9, 9, 9), 1), + ((9, 9, 9), 2), + ((9, 9, 9), 3), + ((9, 9), 0), + ((9, 9), 1), + ((9, 9), 2), + ((9,), 0), + ((9,), 1), + ] + ) + test_suite.layouts = [ + "api::kChannelsPacked", + ] + test_suite.data_gen = "make_seq_tensor" + return test_suite + + +def get_clone_inputs(): + test_suite = VkTestSuite( + [ + ((S2, S1, S2, S1),), + ((S2, S1, S2),), + ((S2, S1),), + ((S2,),), + ((XS, S1, XS, S1),), + ((XS, S1, XS),), + ((S1, XS, S1),), + ((XS, S1),), + ((S1, XS),), + ((S1,),), + ((XS,),), + ] + ) + 
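+    # make_seq_tensor fills inputs with a deterministic sequence instead of the
+    # default random data, which makes element mismatches easier to trace.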
test_suite.layouts = [ + "api::kChannelsPacked", + ] + test_suite.data_gen = "make_seq_tensor" + return test_suite + + +def get_repeat_inputs(): + test_suite = VkTestSuite( + [ + # Repeat channels only (most challenging case) + ((3, XS, S), [2, 1, 1]), + ((7, XS, S), [4, 1, 1]), + ((1, 7, XS, S), [1, 4, 1, 1]), + ((3, 7, XS, S), [1, 4, 1, 1]), + # Repat channels with other dims + ((1, 7, XS, S), [1, 4, 1, 3]), + ((3, 7, XS, S), [1, 4, 1, 3]), + ((3, 7, XS, S), [1, 4, 3, 1]), + ((3, 7, XS, S), [1, 4, 3, 3]), + # Repeat Batch + ((3, 7, XS, S), [3, 4, 3, 3]), + ((3, 7, XS, S), [3, 1, 3, 3]), + # More other cases + ((3, 7, 1, 1), [1, 4, 1, 1]), + ((2, 3), [1, 4]), + ((2, 3), [4, 1]), + ((2, 3), [4, 4]), + ((S1, S2, S2), [1, 3, 1]), + ((S1, S2, S2), [1, 3, 3]), + ((S1, S2, S2), [3, 3, 1]), + ((S1, S2, S2), [3, 3, 3]), + ((S1, S2, S2, S2), [1, 1, 3, 1]), + ((S1, S2, S2, S2), [1, 1, 1, 3]), + ((S1, S2, S2, S2), [1, 1, 3, 3]), + ((S1, S2, S2, S2), [1, 3, 1, 3]), + ((S1, S2, S2, S2), [3, 3, 3, 3]), + ((S1, S2, S2, S2), [3, 3, 1, 1]), + # Expanding cases + ((2, 3), [3, 1, 4]), + ((2, 3), [3, 3, 2, 4]), + ] + ) + test_suite.layouts = [ + "api::kChannelsPacked", + ] + test_suite.data_gen = "make_seq_tensor" + test_suite.dtypes = ["at::kFloat"] return test_suite @@ -61,12 +435,16 @@ def get_pool2d_inputs(): "aten.mul.Tensor": get_binary_elementwise_inputs(), "aten.mm.default": get_mm_inputs(), "aten.max_pool2d_with_indices.default": get_pool2d_inputs(), -} - -prepacked_args = {"aten.mm.default": {"mat2"}} - -support_exceptions = { - "aten.max_pool2d_with_indices.default": { - "layouts": ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"] - }, + "aten.convolution.default": get_conv_inputs(), + "aten.native_layer_norm.default": get_native_layer_norm_inputs(), + "aten.full.default": get_full_inputs(), + "aten.select.int": get_select_int_inputs(), + "aten.select_copy.int": get_select_int_inputs(), + "aten.permute.default": get_permute_inputs(), + "aten.permute_copy.default": get_permute_inputs(), + "aten.view_copy.default": get_view_inputs(), + "aten.slice_copy.Tensor": get_slice_inputs(), + "aten.unsqueeze_copy.default": get_unsqueeze_inputs(), + "aten.clone.default": get_clone_inputs(), + "aten.repeat.default": get_repeat_inputs(), } diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index 79cf418fc33..c0259e71e70 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -1,4 +1,6 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID") +load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps") +load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(is_fbcode = False): @@ -43,6 +45,24 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) + pt_operator_library( + name = "all_aten_ops", + check_decl = False, + include_all_operators = True, + ) + + runtime.cxx_library( + name = "all_aten_ops_lib", + srcs = [], + define_static_target = False, + exported_deps = get_pt_ops_deps( + name = "pt_ops_full", + deps = [ + ":all_aten_ops", + ], + ), + ) + runtime.cxx_binary( name = "compute_graph_op_tests_bin", srcs = [ @@ -52,7 +72,7 @@ def define_common_targets(is_fbcode = False): deps = [ "//third-party/googletest:gtest_main", "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), + ":all_aten_ops_lib", ], ) diff --git 
a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py index 76e59f53952..f0e5547b4fe 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/codegen.py @@ -4,8 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import re from dataclasses import dataclass - from typing import Any, List, Optional, Union from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( @@ -14,15 +14,27 @@ AT_TENSOR, BOOL, CppTestFileGen, - TENSOR_TUPLE, + DOUBLE, + INT, + OPT_AT_TENSOR, + OPT_BOOL, + OPT_DEVICE, + OPT_INT64, + OPT_LAYOUT, + OPT_MEMORY_FORMAT, + OPT_SCALAR_TYPE, TestSuite, TestSuiteGen, + THREE_TENSOR_TUPLE, + TWO_TENSOR_TUPLE, ) from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup -from torchgen.gen import generate_static_dispatch_backend_call -from torchgen.model import NativeFunction +from torchgen.gen import generate_static_dispatch_backend_call, translate_args + +from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature +from torchgen.model import NativeFunction, Variant ################################## ## Custom Test Suite Definition ## @@ -31,13 +43,11 @@ @dataclass class VkTestSuite(TestSuite): - supports = { - "storage_types": ["api::StorageType::TEXTURE_3D"], - "layouts": [ - "api::GPUMemoryLayout::TENSOR_WIDTH_PACKED", - "api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED", - ], - } + def __init__(self, input_cases: List[Any]): + super().__init__(input_cases) + self.storage_types: List[str] = ["api::kTexture3D"] + self.layouts: List[str] = ["api::kChannelsPacked"] + self.data_gen: str = "make_rand_tensor" ########################## @@ -80,7 +90,6 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): self.dot = "->" self.args = [] - self.out = None self.refs = {} self.should_prepack = False @@ -96,7 +105,7 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): ATenArg(name=arg.name, cpp_type=cpp_type, default=arg.default) ) - requires_prepack = "weight" in arg.name + requires_prepack = "weight" in arg.name or "bias" in arg.name supports_prepack = False if arg.name in self.suite_def.prepacked_args: supports_prepack = True @@ -116,7 +125,28 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): self.refs["out"] = ValueRef( name="out_ref", src_cpp_name="out", src_cpp_type=ret_type, is_out=True ) - elif ret_type == TENSOR_TUPLE: + elif ret_type == TWO_TENSOR_TUPLE: + self.refs["out"] = [ + ValueRef( + name="out_ref_first", + src_cpp_name="std::get<0>(out)", + src_cpp_type="at::Tensor", + is_out=True, + ), + ValueRef( + name="out_ref_second", + src_cpp_name="std::get<1>(out)", + src_cpp_type="at::Tensor", + is_out=True, + ), + ValueRef( + name="out_ref", + src_cpp_name="out", + src_cpp_type=ret_type, + is_out=False, + ), + ] + elif ret_type == THREE_TENSOR_TUPLE: self.refs["out"] = [ ValueRef( name="out_ref_first", @@ -130,6 +160,12 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): src_cpp_type="at::Tensor", is_out=True, ), + ValueRef( + name="out_ref_third", + src_cpp_name="std::get<2>(out)", + src_cpp_type="at::Tensor", + is_out=True, + ), ValueRef( name="out_ref", src_cpp_name="out", @@ -152,8 +188,20 @@ def create_aten_fn_call(self) -> str: return func_call + def create_aten_method_call(self) -> str: + # For functions with only Method variant, we 
fallback to the function + # declared in MethodOperators.h. The method is declared as + # at::_ops::{name}::call(*), and ATEN_FN is a handly macro. + cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) + exprs = translate_args(self.f_sig, cpp_sig) + func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" + return func_call + def create_out_src(self) -> str: - return f"{self.out.cpp_type} out = " + self.create_aten_fn_call() + if Variant.function in self.f.variants: + return f"{self.out.cpp_type} out = " + self.create_aten_fn_call() + "\n" + else: + return f"{self.out.cpp_type} out = " + self.create_aten_method_call() + "\n" ## Graph code generation utils @@ -163,7 +211,7 @@ def prepack_ref(self, ref: ValueRef) -> bool: else: return ref.supports_prepack and self.should_prepack - def create_value_for(self, ref: ValueRefList) -> str: + def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 if isinstance(ref, list): ret_str = "" for r in ref: @@ -173,6 +221,30 @@ def create_value_for(self, ref: ValueRefList) -> str: prepack = self.prepack_ref(ref) cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" + + if ref.src_cpp_type == OPT_AT_TENSOR: + ret_str = f"{cpp_type} {ref.name} = " + ret_str += f"!{ref.src_cpp_name}.has_value() ? " + ret_str += f"{self.graph}{self.dot}add_none() : " + if not prepack: + ret_str += f"{self.graph}{self.dot}" + ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" + ret_str += f"{ref.src_cpp_name}->sizes().vec(), " + ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type())); \n" + elif prepack: + ret_str += f"{self.graph}{self.dot}" + ret_str += f"add_tensorref({ref.src_cpp_name}->sizes().vec(), " + ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type()), " + ret_str += f"{ref.src_cpp_name}->const_data_ptr()); \n" + return ret_str + elif ref.src_cpp_type == OPT_INT64: + ret_str = f"{cpp_type} {ref.name} = " + ret_str += f"!{ref.src_cpp_name}.has_value() ? 
" + ret_str += f"{self.graph}{self.dot}add_none() : " + ret_str += f"{self.graph}{self.dot}add_scalar" + ret_str += f"({ref.src_cpp_name}.value());\n" + return ret_str + ret_str = f"{cpp_type} {ref.name} = {self.graph}{self.dot}" if ref.src_cpp_type == AT_TENSOR and not prepack: ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" @@ -189,8 +261,22 @@ def create_value_for(self, ref: ValueRefList) -> str: ret_str += f"add_scalar_list({ref.src_cpp_name}.vec()); \n" elif ref.src_cpp_type == BOOL: ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ref.src_cpp_type == TENSOR_TUPLE: + elif ref.src_cpp_type == INT: + ret_str += f"add_scalar({ref.src_cpp_name}); \n" + elif ref.src_cpp_type == DOUBLE: + ret_str += f"add_scalar({ref.src_cpp_name}); \n" + elif ( + ref.src_cpp_type == OPT_SCALAR_TYPE + or ref.src_cpp_type == OPT_LAYOUT + or ref.src_cpp_type == OPT_DEVICE + or ref.src_cpp_type == OPT_BOOL + or ref.src_cpp_type == OPT_MEMORY_FORMAT + ): + ret_str += "add_none(); \n" + elif ref.src_cpp_type == TWO_TENSOR_TUPLE: ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second}}); \n" + elif ref.src_cpp_type == THREE_TENSOR_TUPLE: + ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second, {ref.name}_third}}); \n" else: raise RuntimeError(f"Unsupported cpp type {ref.src_cpp_type}") @@ -224,14 +310,16 @@ def set_output(self, ref: ValueRefList) -> str: return ret_str def virtual_resize(self, ref: ValueRefList) -> str: + assert isinstance(ref, ValueRef) assert ref.src_cpp_type == AT_TENSOR and ref.is_in if self.prepack_ref(ref): return "" - ret_str = f"{self.graph}{self.dot}get_val({ref.name}.value).toTensor()" - ret_str += f".virtual_resize({ref.src_cpp_name}.sizes().vec());\n" + ret_str = f"{self.graph}{self.dot}get_tensor({ref.name}.value)" + ret_str += f"->virtual_resize({ref.src_cpp_name}.sizes().vec());\n" return ret_str def copy_into_staging(self, ref: ValueRefList) -> str: + assert isinstance(ref, ValueRef) assert ref.src_cpp_type == AT_TENSOR and ref.is_in if self.prepack_ref(ref): return "" @@ -248,7 +336,9 @@ def declare_vk_out_for(self, ref: Union[ValueRef, List[ValueRef]]) -> str: ret_str += self.declare_vk_out_for(r) return ret_str - return f"at::Tensor vk_{ref.name} = at::empty_like({ref.src_cpp_name});\n" + ret_str = f"at::Tensor vk_{ref.name} = at::empty_like({ref.src_cpp_name})" + ret_str += ".contiguous();\n" + return ret_str def copy_from_staging(self, ref: ValueRefList) -> str: if isinstance(ref, list): @@ -272,13 +362,12 @@ def check_graph_out(self, ref: ValueRefList) -> str: ret_str += self.check_graph_out(r) return ret_str - return f"EXPECT_TRUE(check_close({ref.src_cpp_name}, vk_{ref.name}));\n" + return f"EXPECT_TRUE(check_close({ref.src_cpp_name}, vk_{ref.name}, rtol, atol));\n" ## Top level code generation def gen_graph_build_code(self) -> str: graph_build = self.create_out_src() - for aten_arg in self.args: graph_build += self.create_value_for(self.refs[aten_arg.name]) @@ -310,15 +399,31 @@ def gen_graph_exec_code(self) -> str: return graph_exec + def gen_conditional_skips(self) -> str: + skips = "if (test_dtype == at::kHalf && " + skips += f"!{self.graph}{self.dot}context()->adapter_ptr()->has_16bit_storage()) {{\n" + skips += " GTEST_SKIP();" + skips += "}\n" + return skips + def gen_op_check_fn(self) -> str: op_name = self.f.func.name.unambiguous_name() - op_check_fn = self.gen_decl(f"check_{op_name}") + " {" + op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" if self.should_prepack: op_check_fn = 
self.gen_decl(f"prepacked_check_{op_name}") + " {" - op_check_fn += self.gen_graph_build_code() - op_check_fn += self.gen_graph_exec_code() - op_check_fn += self.check_graph_out(self.refs["out"]) - op_check_fn += "}\n" + + op_check_fn_body = "" + op_check_fn_body += self.gen_conditional_skips() + op_check_fn_body += self.gen_graph_build_code() + op_check_fn_body += self.gen_graph_exec_code() + op_check_fn_body += self.check_graph_out(self.refs["out"]) + + # Add two level of indent for readability + op_check_fn_body = re.sub(r"^", " ", op_check_fn_body, flags=re.M) + + op_check_fn += op_check_fn_body + "\n" + op_check_fn += " }\n" + return op_check_fn @@ -327,19 +432,26 @@ def gen_op_check_fn(self) -> str: ################################## test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ +class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ protected: ComputeGraph* graph; at::ScalarType test_dtype = at::kFloat; + float rtol = 1e-5; + float atol = 1e-5; void SetUp() override {{ GraphConfig config; api::StorageType default_storage_type; api::GPUMemoryLayout default_memory_layout; - std::tie(default_storage_type, default_memory_layout) = GetParam(); + std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); config.setStorageTypeOverride(default_storage_type); config.setMemoryLayoutOverride(default_memory_layout); graph = new ComputeGraph(config); + + if (test_dtype == at::kHalf) {{ + rtol = 1e-2; + atol = 1e-2; + }} }} void TearDown() override {{ @@ -356,7 +468,7 @@ class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple class VkTestSuiteGen(TestSuiteGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: List[Any]): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): super().__init__(f, inputs) self.op_reg_name = op_reg_name self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) @@ -378,14 +490,16 @@ def generate_fixture_cpp(self) -> str: ) def gen_parameterization(self) -> str: - storage_types = self.suite_def.supports["storage_types"] - layouts = self.suite_def.supports["layouts"] + dtypes = self.suite_def.dtypes + storage_types = self.suite_def.storage_types + layouts = self.suite_def.layouts return f""" INSTANTIATE_TEST_SUITE_P( - StorageLayoutCombos_{self.op_name}, + Combos_{self.op_name}, GeneratedOpsTest_{self.op_name}, ::testing::Combine( + ::testing::Values({', '.join(dtypes)}), ::testing::Values({', '.join(storage_types)}), ::testing::Values({', '.join(layouts)}))); """ @@ -403,6 +517,7 @@ def gen_parameterization(self) -> str: #include using namespace vkcompute; +using TensorOptions = at::TensorOptions; api::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { switch(at_scalartype) { @@ -420,18 +535,20 @@ def gen_parameterization(self) -> str: } #ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-3) { +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { #else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-8) { +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { #endif // Skip checking index tensors if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { return true; } bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close) { - std::cout << "t1:" << t1 << std::endl; - std::cout << 
"t2:" << t2 << std::endl; + if (!is_close && t1.numel() < 500) { + std::cout << "reference: " << std::endl; + print(t1, 150); + std::cout << "vulkan: " << std::endl; + print(t2, 150); } return is_close; } diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py index 2af45030ec5..d5feada1df8 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/codegen_base.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass from typing import Any, List from torchgen.api import cpp @@ -15,22 +14,33 @@ ## ATen code patterns ## ######################## -AT_TENSOR = "at::Tensor" -AT_SCALAR = "at::Scalar" AT_INT_ARRAY_REF = "at::IntArrayRef" +AT_SCALAR = "at::Scalar" +AT_TENSOR = "at::Tensor" BOOL = "bool" -TENSOR_TUPLE = "::std::tuple" +DOUBLE = "double" +INT = "int64_t" +OPT_AT_TENSOR = "::std::optional" +OPT_BOOL = "::std::optional" +OPT_INT64 = "::std::optional" +OPT_DEVICE = "::std::optional" +OPT_LAYOUT = "::std::optional" +OPT_MEMORY_FORMAT = "::std::optional" +OPT_SCALAR_TYPE = "::std::optional" +TWO_TENSOR_TUPLE = "::std::tuple" +THREE_TENSOR_TUPLE = "::std::tuple" ########################### ## Test Suite definition ## ########################### -@dataclass class TestSuite: - input_cases: List[Any] - prepacked_args = [] - requires_prepack = False + def __init__(self, input_cases: List[Any]): + self.input_cases: List[Any] = input_cases + self.prepacked_args: List[str] = [] + self.requires_prepack: bool = False + self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] def supports_prepack(self): return len(self.prepacked_args) > 0 @@ -103,9 +113,12 @@ def gen_case_name(self, inputs: List[Any], prepack: bool = False) -> str: name_str = name_str[:-1] else: name_str += str(arg_sizes_or_val).replace(".", "p") + + # minus sign is a invalid char for test case. change to "n". 
+ name_str = name_str.replace("-", "n") return name_str - def create_input_data(self, arg: Argument, data: Any) -> str: + def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 ctype = cpp.argumenttype_type(arg.type, mutable=arg.is_write, binds=arg.name) cpp_type = ctype.cpp_type(strip_ref=True) @@ -115,13 +128,35 @@ def create_input_data(self, arg: Argument, data: Any) -> str: ret_str = f"{cpp_type} {arg.name} = " if cpp_type == AT_TENSOR: - ret_str += f"make_rand_tensor({init_list_str(data)}, test_dtype);" + ret_str += f"{self.suite_def.data_gen}({init_list_str(data)}, test_dtype);" + elif cpp_type == OPT_AT_TENSOR: + if str(data) == "None": + ret_str += "std::nullopt;" + else: + ret_str += f"make_rand_tensor({init_list_str(data)}, test_dtype);" elif cpp_type == AT_SCALAR: ret_str += f"{data};" elif cpp_type == AT_INT_ARRAY_REF: ret_str += f"{init_list_str(data)};" elif cpp_type == BOOL: ret_str += f"{str(data).lower()};" + elif cpp_type == INT: + ret_str += f"{str(data).lower()};" + elif cpp_type == DOUBLE: + ret_str += f"{str(data).lower()};" + elif cpp_type == OPT_INT64: + if str(data) == "None": + ret_str += "std::nullopt;" + else: + ret_str += f"{str(data)};" + elif ( + cpp_type == OPT_SCALAR_TYPE + or cpp_type == OPT_LAYOUT + or cpp_type == OPT_DEVICE + or cpp_type == OPT_BOOL + or cpp_type == OPT_MEMORY_FORMAT + ): + ret_str += "std::nullopt;" else: raise RuntimeError(f"Unsupported cpp type {cpp_type}") return ret_str + "\n" @@ -194,6 +229,25 @@ def generate_suite_cpp(self) -> str: return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; }} + +at::Tensor make_seq_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat) {{ + int64_t n = 1; + for (auto size: sizes) {{ + n *= size; + }} + + std::vector values(n); + for (int i=0;i str: def generate_test_suites_cpp(self) -> str: return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) - def add_suite(self, f: NativeFunction, test_suite: TestSuite) -> None: - suites_gen = TestSuiteGen(f, test_suite) + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = TestSuiteGen(f, all_input_cases) self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index d90cfad7bbe..a458fc1c24e 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -496,3 +496,305 @@ def forward(self, x): sample_inputs, memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + + def test_vulkan_backend_sum(self): + class SumModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = torch.sum(x, (), keepdim=True) + x = torch.sum(x) + return x + + module = SumModule() + sample_inputs = (torch.rand(size=(3, 2, 7, 5), dtype=torch.float32),) + + self.lower_module_and_test_output( + module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv2d(self): + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=6, + out_channels=8, + kernel_size=(3, 3), + padding=(2, 3), + stride=(1, 2), + dilation=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + conv2d_module = Conv2dModule() + sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv2d_module, 
+ sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv_transpose2d(self): + class ConvTranspose2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.ConvTranspose2d( + in_channels=6, + out_channels=8, + kernel_size=(3, 3), + padding=(2, 3), + stride=(1, 2), + output_padding=(0, 1), + dilation=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + conv_transpose2d_module = ConvTranspose2dModule() + sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv_transpose2d_module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv2d_dw(self): + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=8, + out_channels=8, + kernel_size=3, + padding=1, + groups=8, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + conv2d_module = Conv2dModule() + sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv2d_module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv2d_pw(self): + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=8, + out_channels=8, + kernel_size=1, + padding=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + conv2d_module = Conv2dModule() + sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv2d_module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv2d_bias_false(self): + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=6, + out_channels=8, + kernel_size=(3, 3), + padding=(2, 3), + stride=(1, 2), + dilation=1, + groups=1, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + + conv2d_module = Conv2dModule() + sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv2d_module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv1d(self): + class Conv1dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv1d( + in_channels=20, + out_channels=10, + kernel_size=6, + stride=5, + padding=5, + dilation=3, + groups=5, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + conv1d_module = Conv1dModule() + sample_inputs = (torch.randn(size=(3, 20, 30), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv1d_module, + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_conv1d_bias_false(self): + class Conv1dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv1d( + in_channels=6, + out_channels=6, + kernel_size=3, + groups=6, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + + conv1d_module = Conv1dModule() + sample_inputs = (torch.randn(size=(1, 6, 7), dtype=torch.float32),) + + self.lower_module_and_test_output( + conv1d_module, + 
sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_native_layer_norm(self): + class NativeLayerNormModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.native_layer_norm( + x, [5], torch.ones(5), torch.zeros(5), 1e-5 + ) + + sample_inputs = (torch.randn(size=(3, 4, 5), dtype=torch.float32),) + + self.lower_module_and_test_output( + NativeLayerNormModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_full(self): + class FullModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.full(x.shape, 42.0) + + sample_inputs = (torch.randn(size=(2, 3, 4, 5), dtype=torch.float32),) + + self.lower_module_and_test_output( + FullModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_reshape(self): + class ReshapeModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.reshape(x, [-1, x.size(-1)]) + + sample_inputs = (torch.randn(size=(5, 3, 4), dtype=torch.float32),) + + self.lower_module_and_test_output( + ReshapeModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_view(self): + class ViewModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.view([-1, x.size(-1)]) + + sample_inputs = (torch.randn(size=(3, 2, 3, 4), dtype=torch.float32),) + + self.lower_module_and_test_output( + ViewModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_unsqueeze(self): + class UnsqueezeModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = torch.unsqueeze(x, 1) + x = torch.unsqueeze(x, 0) + return x + + sample_inputs = (torch.randn(size=(3,), dtype=torch.float32),) + + self.lower_module_and_test_output( + UnsqueezeModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_select(self): + class SelectModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x[0][3] + + sample_inputs = (torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),) + + self.lower_module_and_test_output( + SelectModule(), + sample_inputs, + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index a4e3b2acb29..db966b6a7c1 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -21,20 +21,22 @@ void record_nchw_to_image_op( api::VulkanBuffer& src_buffer, vTensor& v_dst) { api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = { + SV(v_dst.gpu_memory_layout_int())}; context->submit_compute_job( get_nchw_to_image_shader(v_dst), pipeline_barrier, - v_dst.virtual_extents(), - adaptive_work_group_size(v_dst.virtual_extents()), + v_dst.extents(), + adaptive_work_group_size(v_dst.extents()), + specialization_constants, VK_NULL_HANDLE, v_dst.image( pipeline_barrier, api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE), src_buffer, - v_dst.gpu_sizes_ubo()->buffer(), - v_dst.cpu_sizes_ubo()->buffer()); + 
v_dst.sizes_ubo()); } void record_image_to_nchw_op( @@ -42,16 +44,61 @@ void record_image_to_nchw_op( vTensor& v_src, api::VulkanBuffer& dst_buffer) { api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = { + SV(v_src.gpu_memory_layout_int())}; + context->submit_compute_job( get_image_to_nchw_shader(v_src), pipeline_barrier, - v_src.virtual_extents(), - adaptive_work_group_size(v_src.virtual_extents()), + v_src.extents(), + adaptive_work_group_size(v_src.extents()), + specialization_constants, VK_NULL_HANDLE, v_src.image(pipeline_barrier, api::PipelineStage::COMPUTE), dst_buffer, - v_src.gpu_sizes_ubo()->buffer(), - v_src.cpu_sizes_ubo()->buffer()); + v_src.sizes_ubo()); +} + +void record_conv2d_prepack_weights_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst, + const std::vector& original_sizes, + const std::vector& padded_sizes, + const bool transposed) { + api::PipelineBarrier pipeline_barrier{}; + + std::string kernel_name; + if (transposed) { + kernel_name = "conv_transpose2d"; + } else { + kernel_name = "conv2d"; + } + kernel_name += "_prepack_weights"; + add_dtype_suffix(kernel_name, v_dst); + api::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + api::UniformParamsBuffer original_sizes_ubo( + context, api::utils::make_ivec4(original_sizes, /*reverse = */ true)); + api::UniformParamsBuffer padded_sizes_ubo( + context, api::utils::make_ivec2(padded_sizes, /*reverse = */ true)); + + api::SpecVarList specialization_constants = {}; + context->submit_compute_job( + shader, + pipeline_barrier, + v_dst.extents(), + adaptive_work_group_size(v_dst.extents()), + specialization_constants, + VK_NULL_HANDLE, + v_dst.image( + pipeline_barrier, + api::PipelineStage::COMPUTE, + api::MemoryAccessType::WRITE), + src_buffer, + v_dst.sizes_ubo(), + original_sizes_ubo.buffer(), + padded_sizes_ubo.buffer()); } void record_binary_op( @@ -60,16 +107,17 @@ void record_binary_op( vTensor& v_in1, vTensor& v_in2, vTensor& v_dst) { - std::stringstream kernel_name; - kernel_name << "binary_" << op_name << "_nobroadcast__test"; - apply_dtype_suffix(kernel_name, v_dst); + std::string kernel_name = "binary_" + op_name + "_nobroadcast__test"; + add_dtype_suffix(kernel_name, v_dst); api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = {}; context->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - v_dst.virtual_extents(), - adaptive_work_group_size(v_dst.virtual_extents()), + v_dst.extents(), + adaptive_work_group_size(v_dst.extents()), + specialization_constants, VK_NULL_HANDLE, v_dst.image( pipeline_barrier, @@ -77,7 +125,7 @@ void record_binary_op( api::MemoryAccessType::WRITE), v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE), v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE), - v_dst.extents_ubo()->buffer()); + v_dst.sizes_ubo()); } void execute_and_check_add( @@ -123,7 +171,7 @@ void fill_vtensor( const IOValueRef idx, float val, bool iota) { - std::vector data(graph.get_val(idx.value).toTensor().gpu_numel()); + std::vector data(graph.get_tensor(idx.value)->gpu_numel()); if (iota) { std::iota(data.begin(), data.end(), val); } else { @@ -192,13 +240,13 @@ void execute_graph_and_check_output( for (size_t i = 0; i < graph.outputs().size(); ++i) { IOValueRef out_ioval = graph.outputs().at(i); - vTensor& t_out = graph.get_val(out_ioval.value).toTensor(); + vTensorPtr t_out = graph.get_tensor(out_ioval.value); - std::vector 
output_data(t_out.gpu_numel()); + std::vector output_data(t_out->gpu_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); - for (size_t j = 0; j < t_out.numel(); ++j) { + for (size_t j = 0; j < t_out->numel(); ++j) { CHECK_VALUE(output_data, j, expected_outputs.at(i)); } } diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 8dcba015520..a1f3b93dc3a 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -81,6 +81,14 @@ void record_image_to_nchw_op( vTensor& v_src, api::VulkanBuffer& dst_buffer); +void record_conv2d_prepack_weights_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst, + const std::vector& original_sizes, + const std::vector& padded_sizes, + const bool transposed); + void record_binary_op( api::Context* const context, const std::string& op_name, diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 3265191180b..4955d0537ee 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -8,6 +8,8 @@ #include +#include + #include #include @@ -37,6 +39,10 @@ class VulkanComputeAPITest : public ::testing::Test { } }; +TEST_F(VulkanComputeAPITest, print_adapter) { + std::cout << *(api::context()->adapter_ptr()) << std::endl; +} + TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { // Try to get shader from custom shader library const api::ShaderInfo& kernel = VK_KERNEL(test_shader); @@ -44,14 +50,92 @@ TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { ASSERT_TRUE(kernel.kernel_name == "test_shader"); } +TEST_F(VulkanComputeAPITest, spec_var_classes_test) { + // Check equality operator + ASSERT_TRUE(SV(1.5f) == SV(1.5f)); + ASSERT_FALSE(SV(15.0f) == SV(15)); + ASSERT_FALSE(SV(1u) == SV(true)); + + size_t sv_size = sizeof(api::SpecVar); + + api::SpecVarList spec_vars = {}; + ASSERT_TRUE(spec_vars.size() == 0); + spec_vars = {SV(1.1f), SV(32), SV(45)}; + ASSERT_TRUE(spec_vars.size() == 3); + api::SpecVarList spec_vars_other = {SV(2.6f), SV(true), SV(78u), SV(5.5f)}; + spec_vars.append(spec_vars_other); + ASSERT_TRUE(spec_vars.size() == 7); + + // Check validity of the data + const api::SpecVar* data = spec_vars.data(); + ASSERT_TRUE(*(reinterpret_cast(data + 3)) == 2.6f); + ASSERT_TRUE(*(reinterpret_cast(data + 1)) == 32); + ASSERT_TRUE(*(reinterpret_cast(data + 5)) == 78u); + + // Check validity of the map entries + std::vector entries = + spec_vars.generate_map_entries(); + + for (size_t i = 0; i < spec_vars.size(); ++i) { + ASSERT_TRUE(entries[i].constantID == i); + ASSERT_TRUE(entries[i].offset == sv_size * i); + if (i != 4) { + ASSERT_TRUE(entries[i].size == 4); + } else { + ASSERT_TRUE(entries[i].size == 1); + } + } + + // Check copy + api::SpecVarList spec_vars_copy(spec_vars); + ASSERT_TRUE(spec_vars_copy.size() == 7); + + // Check validity of the copied data + const api::SpecVar* copy_data = spec_vars_copy.data(); + ASSERT_TRUE(*(reinterpret_cast(copy_data + 4)) == true); + ASSERT_TRUE(*(reinterpret_cast(copy_data + 2)) == 45); + ASSERT_TRUE(*(reinterpret_cast(copy_data + 6)) == 5.5f); +} + +TEST_F(VulkanComputeAPITest, spec_var_shader_test) { + size_t len = 16; + api::StorageBuffer buffer(api::context(), api::kFloat, len); + + float scale = 3.0f; + float offset = 1.5f; + + { + api::UniformParamsBuffer params(api::context(), int32_t(len)); + uint32_t len_div4 = 
api::utils::div_up(uint32_t(len), uint32_t(4)); + api::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL(fill_buffer), + pipeline_barrier, + {64, 1, 1}, + {len_div4, 1, 1}, + {SV(scale), SV(offset)}, + VK_NULL_HANDLE, + buffer.buffer(), + params.buffer()); + } + + submit_to_gpu(); + + std::vector data(len); + copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + + for (size_t i = 0; i < len; ++i) { + CHECK_VALUE(data, i, scale * i + offset); + } +} + TEST_F(VulkanComputeAPITest, update_params_between_submit) { api::context()->set_cmd(/*reusable = */ true); std::vector sizes = {4, 4, 2}; vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - std::stringstream kernel_name; - kernel_name << "fill_texture__test"; - apply_dtype_suffix(kernel_name, a); + std::string kernel_name("fill_texture__test"); + add_dtype_suffix(kernel_name, a); struct Params final { api::utils::ivec3 size; @@ -69,11 +153,13 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { { api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = {}; api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, {4, 4, 4}, {4, 4, 4}, + specialization_constants, VK_NULL_HANDLE, a.image( pipeline_barrier, @@ -100,6 +186,74 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { check_staging_buffer(staging_buffer, 4.0f); } +template +void test_storage_buffer_type(const size_t len) { + api::StorageBuffer buffer(api::context(), dtype, len); + + std::string kernel_name("idx_fill_buffer"); + switch (dtype) { + case api::kFloat: + kernel_name += "_float"; + break; + case api::kHalf: + kernel_name += "_half"; + break; + case api::kQInt8: + kernel_name += "_int8"; + break; + case api::kQUInt8: + kernel_name += "_uint8"; + break; + default: + throw std::runtime_error("Unsupported dtype"); + break; + } + + api::UniformParamsBuffer params(api::context(), int32_t(len)); + + { + uint32_t len_div4 = api::utils::div_up(uint32_t(len), uint32_t(4)); + api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = {}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name), + pipeline_barrier, + {64, 1, 1}, + {len_div4, 1, 1}, + specialization_constants, + VK_NULL_HANDLE, + buffer.buffer(), + params.buffer()); + } + + submit_to_gpu(); + + std::vector data(len); + copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + + for (size_t i = 0; i < len; ++i) { + CHECK_VALUE(data, i, T(i)); + } +} + +TEST_F(VulkanComputeAPITest, test_buffer_float) { + test_storage_buffer_type(16); +} + +TEST_F(VulkanComputeAPITest, test_buffer_float16) { + if (!api::context()->adapter_ptr()->has_full_float16_buffers_support()) { + GTEST_SKIP(); + } + test_storage_buffer_type(16); +} + +TEST_F(VulkanComputeAPITest, test_buffer_int8) { + if (!api::context()->adapter_ptr()->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + test_storage_buffer_type(16); +} + TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { // Simple test that performs a + b -> c @@ -133,7 +287,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - // No allocations made yet + // No allocations made so far EXPECT_TRUE(get_vma_allocation_count() == 0); std::vector data_a(a.gpu_numel()); @@ -178,7 +332,7 @@ 
TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - // No allocations made yet + // No allocations made so far EXPECT_TRUE(get_vma_allocation_count() == 0); // a and d can share the same memory allocation @@ -193,7 +347,7 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { api::MemoryAllocation c_mem = allocate_memory_for(c); c.image().bind_allocation(c_mem); - // Only 3 allocations should be made + // 3 allocations should be made EXPECT_TRUE(get_vma_allocation_count() == 3); // Specify input data @@ -267,7 +421,7 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { std::vector sizes = {4, 4, 1}; vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - // No allocations made yet + // No allocations yet EXPECT_TRUE(get_vma_allocation_count() == 0); std::vector data_a(a.gpu_numel()); @@ -378,9 +532,8 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { // Compute Graph Tests // -#define EXTRACT_TENSOR(name) \ - std::vector data_##name( \ - graph.get_val(name.value).toTensor().gpu_numel()); \ +#define EXTRACT_TENSOR(name) \ + std::vector data_##name(graph.get_tensor(name.value)->gpu_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); TEST(VulkanComputeGraphTest, test_values_scalars) { @@ -390,10 +543,10 @@ TEST(VulkanComputeGraphTest, test_values_scalars) { ValueRef idx; idx = graph.add_scalar(4); - EXPECT_TRUE(graph.get_val(idx).toInt() == 4); + EXPECT_TRUE(graph.get_int(idx) == 4); idx = graph.add_scalar(5.5f); - EXPECT_TRUE(graph.get_val(idx).toDouble() == 5.5f); + EXPECT_TRUE(graph.get_double(idx) == 5.5f); } TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { @@ -401,10 +554,10 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { ComputeGraph graph(config); ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); - std::vector& arr = graph.get_val(idx).toIntList(); - EXPECT_TRUE(arr.size() == 4); + const auto arr = graph.get_int_list(idx); + EXPECT_TRUE(arr->size() == 4); for (int i = 0; i < 4; i++) { - EXPECT_TRUE(arr[i] == i + 1); + EXPECT_TRUE(arr->at(i) == i + 1); } } @@ -417,10 +570,10 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { std::vector data = {5.0, 4.0, 3.0, 2.0, 1.0}; idx = graph.add_scalar_list(std::move(data)); } - std::vector& arr = graph.get_val(idx).toDoubleList(); - EXPECT_TRUE(arr.size() == 5); + const auto& arr = graph.get_double_list(idx); + EXPECT_TRUE(arr->size() == 5); for (int i = 0; i < 5; i++) { - EXPECT_TRUE(arr[i] == (5 - i)); + EXPECT_TRUE(arr->at(i) == (5 - i)); } } @@ -433,7 +586,7 @@ TEST(VulkanComputeGraphTest, test_values_string) { std::string data = "hello, world"; idx = graph.add_string(std::move(data)); } - std::string& stored = graph.get_val(idx).toString(); + std::string stored = graph.get_string(idx); EXPECT_TRUE(stored == "hello, world"); } @@ -476,7 +629,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { EXTRACT_TENSOR(out); // Sanity check that the values are correct - for (size_t i = 0; i < graph.get_val(out.value).toTensor().numel(); ++i) { + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { CHECK_VALUE(data_out, i, val_c); } } @@ -534,7 +687,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { EXTRACT_TENSOR(out); // Sanity check that the values are correct - for (size_t i = 0; i < 
graph.get_val(out.value).toTensor().numel(); ++i) { + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { CHECK_VALUE(data_out, i, val_out); } } @@ -558,9 +711,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { api::kFloat, /*shared_object_idx = */ 4); - // +4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader + // +2: t.sizes_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 6); + EXPECT_TRUE(get_vma_allocation_count() == 4); ValueRef c = graph.add_tensor( size_big, @@ -575,10 +728,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { api::kFloat, /*shared_object_idx = */ 2); - // +3: out.gpu_sizes_ubo(), alpha UBO, broadcast UBO for arithmetic shader - // +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 12); + EXPECT_TRUE(get_vma_allocation_count() == 9); ValueRef e = graph.add_tensor( size_big, @@ -593,15 +746,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { out.staging = graph.set_output_tensor(out.value); // +2: alpha UBO, broadcast UBO for arithmetic shader - // +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader + // +1: t.sizes_ubo() for staging shader // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 17); + EXPECT_TRUE(get_vma_allocation_count() == 13); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 20); + EXPECT_TRUE(get_vma_allocation_count() == 16); // Run graph @@ -609,11 +762,11 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; for (auto& new_sizes : new_sizes_list) { - graph.get_val(a.value).toTensor().virtual_resize(new_sizes); - graph.get_val(b.value).toTensor().virtual_resize(new_sizes); - graph.get_val(c).toTensor().virtual_resize(new_sizes); - graph.get_val(d.value).toTensor().virtual_resize(new_sizes); - graph.get_val(e).toTensor().virtual_resize(new_sizes); + graph.get_tensor(a.value)->virtual_resize(new_sizes); + graph.get_tensor(b.value)->virtual_resize(new_sizes); + graph.get_tensor(c)->virtual_resize(new_sizes); + graph.get_tensor(d.value)->virtual_resize(new_sizes); + graph.get_tensor(e)->virtual_resize(new_sizes); float val_a = new_sizes[1] + 4.0f; float val_b = new_sizes[2] + 1.5f; @@ -630,7 +783,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { EXTRACT_TENSOR(out); // Sanity check that the values are correct - for (size_t i = 0; i < graph.get_val(out.value).toTensor().numel(); i++) { + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) { CHECK_VALUE(data_out, i, val_out); } } @@ -645,7 +798,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { graph.propagate_resize(); // Check output shape - EXPECT_TRUE(graph.get_val(out.value).toTensor().sizes() == new_sizes); + EXPECT_TRUE(graph.get_tensor(out.value)->sizes() == new_sizes); float val_a = new_sizes[1] + 6.0f; float val_b = new_sizes[2] + 2.5f; @@ -662,7 +815,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { EXTRACT_TENSOR(out); // Sanity check that the values are correct - for (size_t i = 0; i < 
graph.get_val(out.value).toTensor().numel(); i++) { + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) { CHECK_VALUE(data_out, i, val_out); } } @@ -717,7 +870,7 @@ TEST(VulkanComputeGraphTest, test_large_graph) { EXTRACT_TENSOR(out); - for (int i = 0; i < graph.get_val(out.value).toTensor().numel(); i++) { + for (int i = 0; i < graph.get_tensor(out.value)->numel(); i++) { CHECK_VALUE(data_out, i, val_e); } } @@ -745,40 +898,42 @@ void run_from_gpu_test( api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, api::ScalarType dtype = api::kFloat, api::StorageType storage_type = api::StorageType::TEXTURE_3D) { + if (dtype == api::kHalf && + !api::context()->adapter_ptr()->has_16bit_storage()) { + return; + } vTensor vten = - vTensor(api::context(), sizes, api::kFloat, storage_type, memory_layout); + vTensor(api::context(), sizes, dtype, storage_type, memory_layout); - std::stringstream kernel_name; - kernel_name << "idx_fill_texture"; - apply_memory_layout_suffix(kernel_name, vten); - apply_dtype_suffix(kernel_name, vten); + std::string kernel_name("idx_fill_texture"); + add_memory_layout_suffix(kernel_name, vten); + add_dtype_suffix(kernel_name, vten); { api::PipelineBarrier pipeline_barrier{}; + api::SpecVarList specialization_constants = {vten.gpu_memory_layout_int()}; api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name.str()), + VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - vten.virtual_extents(), + vten.extents(), {4, 4, 4}, + specialization_constants, VK_NULL_HANDLE, vten.image( pipeline_barrier, api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE), - vten.gpu_sizes_ubo()->buffer(), - vten.cpu_sizes_ubo()->buffer()); + vten.sizes_ubo()); } - api::StorageBuffer staging_buffer( - api::context(), api::kFloat, vten.gpu_numel()); + api::StorageBuffer staging_buffer(api::context(), dtype, vten.gpu_numel()); record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer()); submit_to_gpu(); std::vector data_out(staging_buffer.numel()); - copy_staging_to_ptr( - staging_buffer, data_out.data(), sizeof(float) * staging_buffer.numel()); + copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes()); for (int i = 0; i < vten.numel(); i++) { CHECK_VALUE(data_out, i, i); @@ -792,12 +947,16 @@ void run_to_gpu_test( api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, api::ScalarType dtype = api::kFloat, api::StorageType storage_type = api::StorageType::TEXTURE_3D) { + if (dtype == api::kHalf && + !api::context()->adapter_ptr()->has_16bit_storage()) { + return; + } + vTensor vten = vTensor(api::context(), sizes, api::kFloat, storage_type, memory_layout); // Create and fill input staging buffer - api::StorageBuffer staging_buffer_in( - api::context(), api::kFloat, vten.gpu_numel()); + api::StorageBuffer staging_buffer_in(api::context(), dtype, vten.gpu_numel()); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { @@ -807,7 +966,7 @@ void run_to_gpu_test( // Output staging buffer api::StorageBuffer staging_buffer_out( - api::context(), api::kFloat, vten.gpu_numel()); + api::context(), dtype, vten.gpu_numel()); // Copy data in and out of the tensor record_nchw_to_image_op(api::context(), staging_buffer_in.buffer(), vten); @@ -819,9 +978,7 @@ void run_to_gpu_test( // Extract data from output staging buffer std::vector data_out(staging_buffer_out.numel()); copy_staging_to_ptr( - staging_buffer_out, - data_out.data(), - sizeof(float) * staging_buffer_out.numel()); + staging_buffer_out, data_out.data(), 
staging_buffer_out.nbytes()); // All indices should be equal to the input data for (int i = 0; i < vten.numel(); i++) { @@ -874,7 +1031,7 @@ TEST(VulkanToFromGPUShaderTest, to_gpu_and_from_gpu_test_texture) { for (auto& sizes : to_test) { RUN_TESTS(float, api::kFloat) - RUN_TESTS(float, api::kHalf) + RUN_TESTS(c10::Half, api::kHalf) } #undef RUN_TESTS } @@ -1044,11 +1201,39 @@ void test_mm( } TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ - test_mm(/*B=*/1, /*M=*/31, /*K=*/127, /*N=*/23, dtype, layout, prepack); \ - test_mm(/*B=*/5, /*M=*/31, /*K=*/127, /*N=*/23, dtype, layout, prepack); \ - test_mm(/*B=*/7, /*M=*/13, /*K=*/89, /*N=*/17, dtype, layout, prepack); \ - test_mm(/*B=*/1, /*M=*/13, /*K=*/89, /*N=*/17, dtype, layout, prepack); +#define RUN_TESTS(dtype, layout, prepack) \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + layout, \ + prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); @@ -1102,21 +1287,21 @@ void test_max_pool2d( // Run graph - fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota=*/true); + fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true); - vTensor& t_in = graph.get_val(in_ioval.value).toTensor(); - std::vector input_data(t_in.gpu_numel()); + vTensorPtr t_in = graph.get_tensor(in_ioval.value); + std::vector input_data(t_in->gpu_numel()); graph.copy_from_staging( in_ioval.staging, input_data.data(), input_data.size()); graph.execute(); - vTensor& t_out = graph.get_val(out_ioval.value).toTensor(); - std::vector output_data(t_out.gpu_numel()); + vTensorPtr t_out = graph.get_tensor(out_ioval.value); + std::vector output_data(t_out->gpu_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); - vTensor& t_idx = graph.get_val(idx_ioval.value).toTensor(); - std::vector index_data(t_idx.gpu_numel()); + vTensorPtr t_idx = graph.get_tensor(idx_ioval.value); + std::vector index_data(t_idx->gpu_numel()); graph.copy_from_staging( idx_ioval.staging, index_data.data(), index_data.size()); @@ -1124,9 +1309,9 @@ void test_max_pool2d( int h_offset = kernel_copy[0] - 1; int w_offset = kernel_copy[1] - 1; - int h_out = api::utils::val_at(-2, t_out.sizes()); - int w_out = api::utils::val_at(-1, t_out.sizes()); - int w_in = api::utils::val_at(-1, t_in.sizes()); + int h_out = api::utils::val_at(-2, t_out->sizes()); + int w_out = api::utils::val_at(-1, t_out->sizes()); + int w_in = api::utils::val_at(-1, t_in->sizes()); for (size_t i = 0; i < h_out; ++i) { for (size_t j = 0; j < w_out; ++j) { size_t idx_out = i * w_out + j; @@ -1140,7 +1325,79 @@ void test_max_pool2d( TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) { std::vector kernel = {2, 3}; test_max_pool2d( - /*in_size=*/{1, 4, 6}, - /*base_val=*/10.0f, + /*in_size = */ {1, 4, 6}, + /*base_val = */ 10.0f, kernel); } + +void test_conv2d( + const std::vector& original_sizes, + const std::vector& padded_sizes, + const std::vector& gpu_sizes, + const bool transposed, + const std::vector& data_out_expected) { + vTensor vten = vTensor( + api::context(), + gpu_sizes, + api::kFloat, + 
api::StorageType::TEXTURE_2D, + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + + // Create and fill input staging buffer + const int64_t in_numel = api::utils::multiply_integers(original_sizes); + api::StorageBuffer staging_buffer_in(api::context(), api::kFloat, in_numel); + + std::vector data_in(in_numel); + for (int i = 0; i < in_numel; i++) { + data_in[i] = i + 1; + } + copy_ptr_to_staging( + data_in.data(), staging_buffer_in, sizeof(float) * in_numel); + + // Output staging buffer + const int64_t out_numel = + padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; + api::StorageBuffer staging_buffer_out(api::context(), api::kFloat, out_numel); + + // Copy data in and out of the tensor + record_conv2d_prepack_weights_op( + api::context(), + staging_buffer_in.buffer(), + vten, + original_sizes, + padded_sizes, + transposed); + record_image_to_nchw_op(api::context(), vten, staging_buffer_out.buffer()); + + // Execute command buffer + submit_to_gpu(); + + // Extract data from output staging buffer + std::vector data_out(out_numel); + copy_staging_to_ptr( + staging_buffer_out, data_out.data(), sizeof(float) * out_numel); + + // Check data matches results copied from ATen-VK + for (int i = 0; i < vten.numel(); i++) { + CHECK_VALUE(data_out, i, data_out_expected[i]); + } +} + +TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) { + test_conv2d( + /*original_sizes = */ {2, 3, 1, 2}, + /*padded_sizes = */ {4, 4}, + /*gpu_sizes = */ {4, 1, 8}, + /*transposed = */ false, + /*data_out_expected = */ {1, 3, 5, 0, 2, 4, 6, 0, 7, 9, 11, + 0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + test_conv2d( + /*original_sizes = */ {2, 3, 1, 2}, + /*padded_sizes = */ {4, 4}, + /*gpu_sizes = */ {4, 1, 8}, + /*transposed = */ true, + /*data_out_expected = */ {2, 8, 0, 0, 1, 7, 0, 0, 4, 10, 0, + 0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); +} diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 90a4f98952a..688ea02d6d7 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -81,7 +81,7 @@ add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs}) target_link_libraries(xnnpack_backend PRIVATE ${xnnpack_third_party} - executorch + executorch_no_prim_ops xnnpack_schema) target_include_directories(xnnpack_backend diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index fe359b7adc9..6e1731799d3 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -7,8 +7,11 @@ mechanism for leveraging the XNNPACK library to accelerate operators running on CPU. ## Layout -- `runtime/` : Runtime logic used at inference. This contains all the cpp files - used to build the runtime graph and execute the XNNPACK model +- `cmake/` : CMake related files +- `operators`: the directory to store all of op visitors + - `node_visitor.py`: Implementation of serializing each lowerable operator + node + - ... - `partition/`: Partitioner is used to identify operators in model's graph that are suitable for lowering to XNNPACK delegate - `xnnpack_partitioner.py`: Contains partitioner that tags graph patterns @@ -16,10 +19,8 @@ CPU. - `configs.py`: Contains lists of op/modules for XNNPACK lowering - `passes/`: Contains passes which are used before preprocessing to prepare the graph for XNNPACK lowering -- `operators`: the directory to store all of op visitors - - `node_visitor.py`: Implementation of serializing each lowerable operator - node - - ... 
+- `runtime/` : Runtime logic used at inference. This contains all the cpp files + used to build the runtime graph and execute the XNNPACK model - `serialization/`: Contains files related to serializing the XNNPACK graph representation of the PyTorch model - `schema.fbs`: Flatbuffer schema of serialization format @@ -28,64 +29,107 @@ CPU. - `xnnpack_graph_serialize`: Implementation for serializing dataclasses from graph schema to flatbuffer - `test/`: Tests for XNNPACK Delegate +- `third-party/`: third-party libraries used by XNNPACK Delegate - `xnnpack_preprocess.py`: Contains preprocess implementation which is called by `to_backend` on the graph or subgraph of a model returning a preprocessed blob responsible for executing the graph or subgraph at runtime +## End to End Example + +To further understand the features of the XNNPACK Delegate and how to use it, consider the following end to end example with MobilenetV2. + +### Lowering a model to XNNPACK +```python +import torch +import torchvision.models as models + +from torch.export import export, ExportedProgram +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge +from executorch.exir.backend.backend_api import to_backend + + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +exported_program: ExportedProgram = export(mobilenet_v2, sample_inputs) +edge: EdgeProgramManager = to_edge(exported_program) + +edge = edge.to_backend(XnnpackPartitioner()) +``` + +We will go through this example with the [MobileNetV2](https://pytorch.org/hub/pytorch_vision_mobilenet_v2/) pretrained model downloaded from the TorchVision library. The flow of lowering a model starts after exporting the model `to_edge`. We call the `to_backend` api with the `XnnpackPartitioner`. The partitioner identifies the subgraphs suitable for XNNPACK backend delegate to consume. Afterwards, the identified subgraphs will be serialized with the XNNPACK Delegate flatbuffer schema and each subgraph will be replaced with a call to the XNNPACK Delegate. + +```python +>>> print(edge.exported_program().graph_module) +GraphModule( + (lowered_module_0): LoweredBackendModule() + (lowered_module_1): LoweredBackendModule() +) + +def forward(self, arg314_1): + lowered_module_0 = self.lowered_module_0 + executorch_call_delegate = torch.ops.higher_order.executorch_call_delegate(lowered_module_0, arg314_1); lowered_module_0 = arg314_1 = None + getitem = executorch_call_delegate[0]; executorch_call_delegate = None + aten_view_copy_default = executorch_exir_dialects_edge__ops_aten_view_copy_default(getitem, [1, 1280]); getitem = None + aten_clone_default = executorch_exir_dialects_edge__ops_aten_clone_default(aten_view_copy_default); aten_view_copy_default = None + lowered_module_1 = self.lowered_module_1 + executorch_call_delegate_1 = torch.ops.higher_order.executorch_call_delegate(lowered_module_1, aten_clone_default); lowered_module_1 = aten_clone_default = None + getitem_1 = executorch_call_delegate_1[0]; executorch_call_delegate_1 = None + return (getitem_1,) +``` + +We print the graph after lowering above to show the new nodes that were inserted to call the XNNPACK Delegate. The subgraphs which are being delegated to XNNPACK are the first argument at each call site. 
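+For a quick sanity check of how much of the model was delegated, a minimal sketch like the one below counts the delegate call sites. It assumes the `edge` object from the lowering snippet above and relies only on standard `torch.fx` graph traversal.
+
+```python
+# Illustrative only: count the subgraphs that were handed off to the XNNPACK delegate.
+graph_module = edge.exported_program().graph_module
+delegate_calls = [
+    node
+    for node in graph_module.graph.nodes
+    if node.op == "call_function"
+    and "executorch_call_delegate" in str(node.target)
+]
+print(f"Number of delegated subgraphs: {len(delegate_calls)}")
+```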
Most of the `convolution-relu-add` blocks and `linear` blocks were delegated to XNNPACK. We can also see the operators that could not be lowered to the XNNPACK delegate, such as `clone` and `view_copy`. + +```python +exec_prog = edge.to_executorch() + +with open("xnnpack_mobilenetv2.pte", "wb") as file: + exec_prog.write_to_file(file) +``` +After lowering to the XNNPACK Program, we can then prepare it for ExecuTorch and save the model as a `.pte` file. `.pte` is a binary format that stores the serialized ExecuTorch graph. + + +### Running the XNNPACK Model with CMake +After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the xnn_executor_runner, which is a sample wrapper for the ExecuTorch Runtime and XNNPACK Backend. We begin by configuring the CMake build as follows: +```bash +# cd to the root of executorch repo +cd executorch + +# Get a clean cmake-out directory +rm -rf cmake-out +mkdir cmake-out + +# Configure cmake +cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . +``` +Then you can build the runtime components with + +```bash +cmake --build cmake-out -j9 --target install --config Release +``` + +Now you should be able to find the executable built at `./cmake-out/backends/xnnpack/xnn_executor_runner`. You can run it with the model you generated above: +```bash +./cmake-out/backends/xnnpack/xnn_executor_runner --model_path=./xnnpack_mobilenetv2.pte +``` + ## Help & Improvements If you have problems or questions, or have suggestions for ways to make implementation and testing better, please reach out to the PyTorch Edge team or create an issue on [github](https://www.github.com/pytorch/executorch/issues). -## Contributing - -Please follow the following steps and guidelines when adding a new operator -implementation to this library. The goals of these guidelines are to -- Make it straightforward to add new XNNPACK operators. -- Ensure that the newly added operators are of high quality, and are easy to - maintain -- Make it easy for users to find available operator implementations, and to - trust in their quality and behavioral stability. - -### AoT and Serialization Overview -#### Serialization: -XNNPACK delegate uses flatbuffer to serialize its nodes and values. In order to -add -[preprocessing](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/xnnpack_preprocess.py) -support for a new operator, we must add the operator in both the flatbuffer -[schema](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/serialization/schema.fbs), -as well as the mirrored python [data -class](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/serialization/xnnpack_graph_schema.py). -These tables are based on the arguments to the XNNPACK Subgraph APIs. These -APIs can be found -[here](https://github.com/google/xnnpack/blob/master/include/xnnpack.h). We -essentially serialize all the static arguments we need to call `define_{new -operator}()`. - -#### AoT Preprocess: -To add logic to preprocess new operators for the XNNPACK Delegate, we can -create new node_visitors that perform the serialization of the new operator. An -example can be found [here]().
The function of these node_visitors is to -serialize all the data we define to need in the schema above. - -#### AoT Partitioner: -XnnpackPartitioner is used to select the pattern (like the linear module -graph) in a big graph such that the selected nodes will be delegated to -XNNPACK. To support a new op (for example, sigmoid), add the corresponding op -or module to the -[config.py](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/partition/configs.py), -which captures the sigmoid op. - -#### How does it work? -- Tag the nodes: in the XNNPACK partitioner's config, which lists all ops that - are supported by the current XNNPACK backend in executorch. When call - `XnnpackPartitioner.partition()`, it will tag all the nodes that matches the - patterns listed in self.pattern -- Lower the nodes; when we call `to_backend(graph_module, XnnpackPartitioner)`, - it will loop through all the tagged nodes, and lower the group with the same - tag. - - -#### Adding Tests for newly minted operators -To test newly added operators, we can add unit tests in: -[tests](https://github.com/pytorch/executorch/tree/main/backends/xnnpack/test) + +## See Also +For more information about the XNNPACK Delegate, please check out the following resources: +- [ExecuTorch XNNPACK Delegate](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) +- [Building and Running ExecuTorch with XNNPACK Backend](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) diff --git a/backends/xnnpack/operators/op_squeeze.py b/backends/xnnpack/operators/op_squeeze.py index e857b6c68bb..8ed5aa36ae6 100644 --- a/backends/xnnpack/operators/op_squeeze.py +++ b/backends/xnnpack/operators/op_squeeze.py @@ -7,7 +7,6 @@ from typing import cast, Dict import torch -from executorch.backends.transforms import get_shape from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -53,7 +52,21 @@ def define_node( "val" in input_node.meta, "Missing val in tensor metadata for input when serializing XNNStaticReshape node", ) - new_shape = get_shape(input_node)[:-1] + dynamic_shape = node.meta["val"].shape + new_shape = [] + + num_dynamic_dims = 0 + for dim in dynamic_shape: + if isinstance(dim, torch.SymInt): + num_dynamic_dims += 1 + new_shape.append(0) + else: + new_shape.append(dim) + + check_or_raise( + num_dynamic_dims <= 1, + "XNNPACK reshape only supports 1 dynamic dimension. This may occur when ", + ) ser_node = XNode( xnode_union=XNNStaticReshape( @@ -101,7 +114,21 @@ def define_node( "val" in input_node.meta, "Missing val in tensor metadata for input when serializing XNNStaticReshape node", ) - new_shape = get_shape(input_node) + [1] + dynamic_shape = node.meta["val"].shape + new_shape = [] + + num_dynamic_dims = 0 + for dim in dynamic_shape: + if isinstance(dim, torch.SymInt): + num_dynamic_dims += 1 + new_shape.append(0) + else: + new_shape.append(dim) + + check_or_raise( + num_dynamic_dims <= 1, + "XNNPACK reshape only supports 1 dynamic dimension. 
This may occur when ", + ) ser_node = XNode( xnode_union=XNNStaticReshape( diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 942c09c8ae8..69f882523c8 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -36,6 +36,7 @@ class ConvertToLinearPass(XNNPACKPass): targets = [ exir_ops.edge.aten.mm.default, exir_ops.edge.aten.addmm.default, + exir_ops.edge.aten.bmm.default, ] @staticmethod diff --git a/backends/xnnpack/test/models/deeplab_v3.py b/backends/xnnpack/test/models/deeplab_v3.py index ccaccb898d2..c5f6bfe17bc 100644 --- a/backends/xnnpack/test/models/deeplab_v3.py +++ b/backends/xnnpack/test/models/deeplab_v3.py @@ -36,6 +36,5 @@ def test_fp32_dl3(self): .partition() .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/edsr.py b/backends/xnnpack/test/models/edsr.py index d748e35bb74..ca080b20b49 100644 --- a/backends/xnnpack/test/models/edsr.py +++ b/backends/xnnpack/test/models/edsr.py @@ -25,8 +25,7 @@ def test_fp32_edsr(self): .partition() .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_edsr(self): @@ -38,6 +37,5 @@ def test_qs8_edsr(self): .partition() .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/emformer_rnnt.py b/backends/xnnpack/test/models/emformer_rnnt.py index 3728c9b07c9..3992c828964 100644 --- a/backends/xnnpack/test/models/emformer_rnnt.py +++ b/backends/xnnpack/test/models/emformer_rnnt.py @@ -21,8 +21,8 @@ def __init__(self): self.rnnt = decoder.model class Joiner(EmformerRnnt): - def forward(self, predict_inputs): - return self.rnnt.join(*predict_inputs) + def forward(self, a, b, c, d): + return self.rnnt.join(a, b, c, d) def get_example_inputs(self): join_inputs = ( @@ -31,7 +31,7 @@ def get_example_inputs(self): torch.rand([1, 128, 1024]), torch.tensor([128]), ) - return (join_inputs,) + return join_inputs def test_fp32_emformer_joiner(self): joiner = self.Joiner() @@ -43,21 +43,19 @@ def test_fp32_emformer_joiner(self): .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class Predictor(EmformerRnnt): - def forward(self, predict_inputs): - return self.rnnt.predict(*predict_inputs) + def forward(self, a, b): + return self.rnnt.predict(a, b, None) def get_example_inputs(self): predict_inputs = ( torch.zeros([1, 128], dtype=int), torch.tensor([128], dtype=int), - None, ) - return (predict_inputs,) + return predict_inputs @unittest.skip("T183426271") def test_fp32_emformer_predictor(self): @@ -70,20 +68,19 @@ def test_fp32_emformer_predictor(self): .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class Transcriber(EmformerRnnt): - def forward(self, predict_inputs): - return self.rnnt.transcribe(*predict_inputs) + def forward(self, a, b): + return self.rnnt.transcribe(a, b) def get_example_inputs(self): transcribe_inputs = ( torch.randn(1, 128, 80), torch.tensor([128]), ) - return (transcribe_inputs,) + return transcribe_inputs def test_fp32_emformer_transcriber(self): transcriber = self.Transcriber() @@ -95,6 +92,5 @@ def test_fp32_emformer_transcriber(self): 
.check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/inception_v3.py b/backends/xnnpack/test/models/inception_v3.py index 58839014557..ddadbdd2f81 100644 --- a/backends/xnnpack/test/models/inception_v3.py +++ b/backends/xnnpack/test/models/inception_v3.py @@ -7,9 +7,9 @@ import unittest import torch -import torchvision.models as models from executorch.backends.xnnpack.test.tester import Tester from executorch.backends.xnnpack.test.tester.tester import Quantize +from torchvision import models class TestInceptionV3(unittest.TestCase): @@ -42,8 +42,7 @@ def test_fp32_ic3(self): .check_not(list(self.all_operators)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_ic3(self): @@ -63,6 +62,5 @@ def test_qs8_ic3(self): .check_not(list(ops_after_quantization)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/inception_v4.py b/backends/xnnpack/test/models/inception_v4.py index 534fb90ad6c..528512c82f2 100644 --- a/backends/xnnpack/test/models/inception_v4.py +++ b/backends/xnnpack/test/models/inception_v4.py @@ -39,8 +39,7 @@ def test_fp32_ic4(self): .check_not(list(self.all_operators)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_ic4(self): @@ -60,6 +59,5 @@ def test_qs8_ic4(self): .check_not(list(ops_after_quantization)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/llama2_et_example.py b/backends/xnnpack/test/models/llama2_et_example.py index 46dae356cd8..4716f2d6a95 100644 --- a/backends/xnnpack/test/models/llama2_et_example.py +++ b/backends/xnnpack/test/models/llama2_et_example.py @@ -45,6 +45,5 @@ def _test(self, dtype: torch.dtype = torch.float): .dump_artifact() .to_executorch() .serialize() - .run_method() - .compare_outputs(atol=5e-2) + .run_method_and_compare_outputs(atol=5e-2) ) diff --git a/backends/xnnpack/test/models/mobilebert.py b/backends/xnnpack/test/models/mobilebert.py index bf6b2dfc408..df66ffd4507 100644 --- a/backends/xnnpack/test/models/mobilebert.py +++ b/backends/xnnpack/test/models/mobilebert.py @@ -38,6 +38,5 @@ def test_fp32_mobilebert(self): .check_not(list(self.supported_ops)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/mobilenet_v2.py b/backends/xnnpack/test/models/mobilenet_v2.py index dbd9bc744b4..c2bc364b9c4 100644 --- a/backends/xnnpack/test/models/mobilenet_v2.py +++ b/backends/xnnpack/test/models/mobilenet_v2.py @@ -7,9 +7,9 @@ import unittest import torch -import torchvision.models as models from executorch.backends.xnnpack.test.tester import Tester from executorch.backends.xnnpack.test.tester.tester import Quantize +from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights @@ -29,9 +29,15 @@ class TestMobileNetV2(unittest.TestCase): } def test_fp32_mv2(self): + dynamic_shapes = ( + { + 2: torch.export.Dim("height", min=224, max=455), + 3: torch.export.Dim("width", min=224, max=455), + }, + ) ( - Tester(self.mv2, self.model_inputs) + Tester(self.mv2, self.model_inputs, dynamic_shapes=dynamic_shapes) .export() .to_edge() 
.check(list(self.all_operators)) @@ -40,8 +46,7 @@ def test_fp32_mv2(self): .check_not(list(self.all_operators)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=10) ) def test_qs8_mv2(self): @@ -50,8 +55,15 @@ def test_qs8_mv2(self): "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", } + dynamic_shapes = ( + { + 2: torch.export.Dim("height", min=224, max=455), + 3: torch.export.Dim("width", min=224, max=455), + }, + ) + ( - Tester(self.mv2, self.model_inputs) + Tester(self.mv2, self.model_inputs, dynamic_shapes=dynamic_shapes) .quantize(Quantize(calibrate=False)) .export() .to_edge() @@ -61,6 +73,5 @@ def test_qs8_mv2(self): .check_not(list(ops_after_quantization)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=10) ) diff --git a/backends/xnnpack/test/models/mobilenet_v3.py b/backends/xnnpack/test/models/mobilenet_v3.py index 20d04b119e1..d990fa0e3bf 100644 --- a/backends/xnnpack/test/models/mobilenet_v3.py +++ b/backends/xnnpack/test/models/mobilenet_v3.py @@ -7,15 +7,21 @@ import unittest import torch -import torchvision.models as models from executorch.backends.xnnpack.test.tester import Tester from executorch.backends.xnnpack.test.tester.tester import Quantize +from torchvision import models class TestMobileNetV3(unittest.TestCase): mv3 = models.mobilenetv3.mobilenet_v3_small(pretrained=True) mv3 = mv3.eval() model_inputs = (torch.ones(1, 3, 224, 224),) + dynamic_shapes = ( + { + 2: torch.export.Dim("height", min=224, max=455), + 3: torch.export.Dim("width", min=224, max=455), + }, + ) all_operators = { "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", @@ -33,7 +39,7 @@ class TestMobileNetV3(unittest.TestCase): def test_fp32_mv3(self): ( - Tester(self.mv3, self.model_inputs) + Tester(self.mv3, self.model_inputs, dynamic_shapes=self.dynamic_shapes) .export() .to_edge() .check(list(self.all_operators)) @@ -42,8 +48,7 @@ def test_fp32_mv3(self): .check_not(list(self.all_operators)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=5) ) def test_qs8_mv3(self): @@ -53,7 +58,7 @@ def test_qs8_mv3(self): ops_after_lowering = self.all_operators ( - Tester(self.mv3, self.model_inputs) + Tester(self.mv3, self.model_inputs, dynamic_shapes=self.dynamic_shapes) .quantize(Quantize(calibrate=False)) .export() .to_edge() @@ -63,6 +68,5 @@ def test_qs8_mv3(self): .check_not(list(ops_after_lowering)) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=5) ) diff --git a/backends/xnnpack/test/models/resnet.py b/backends/xnnpack/test/models/resnet.py index 73e68c855e9..06c889fc179 100644 --- a/backends/xnnpack/test/models/resnet.py +++ b/backends/xnnpack/test/models/resnet.py @@ -14,29 +14,63 @@ class TestResNet18(unittest.TestCase): - def test_fp32_resnet18(self): - inputs = (torch.ones(1, 3, 224, 224),) + inputs = (torch.ones(1, 3, 224, 224),) + dynamic_shapes = ( + { + 2: torch.export.Dim("height", min=224, max=455), + 3: torch.export.Dim("width", min=224, max=455), + }, + ) + + class DynamicResNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.model = torchvision.models.resnet18() + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, + size=(224, 224), + mode="bilinear", + align_corners=True, + antialias=False, + ) + return self.model(x) + + def 
_test_exported_resnet(self, tester): ( - Tester(torchvision.models.resnet18(), inputs) - .export() + tester.export() .to_edge() .partition() + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_convolution_default", + "executorch_exir_dialects_edge__ops_aten_mean_dim", + ] + ) + .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) + def test_fp32_resnet18(self): + self._test_exported_resnet(Tester(torchvision.models.resnet18(), self.inputs)) + def test_qs8_resnet18(self): - inputs = (torch.ones(1, 3, 224, 224),) - ( - Tester(torchvision.models.resnet18(), inputs) - .quantize(Quantize(calibrate=False)) - .export() - .to_edge() - .partition() - .to_executorch() - .serialize() - .run_method() - .compare_outputs() + quantized_tester = Tester(torchvision.models.resnet18(), self.inputs).quantize( + Quantize(calibrate=False) + ) + self._test_exported_resnet(quantized_tester) + + def test_fp32_resnet18_dynamic(self): + self._test_exported_resnet( + Tester(self.DynamicResNet(), self.inputs, self.dynamic_shapes) + ) + + def test_qs8_resnet18_dynamic(self): + self._test_exported_resnet( + Tester(self.DynamicResNet(), self.inputs, self.dynamic_shapes).quantize( + Quantize(calibrate=False) + ) ) diff --git a/backends/xnnpack/test/models/torchvision_vit.py b/backends/xnnpack/test/models/torchvision_vit.py index 226cc73f401..836a9056857 100644 --- a/backends/xnnpack/test/models/torchvision_vit.py +++ b/backends/xnnpack/test/models/torchvision_vit.py @@ -7,14 +7,37 @@ import unittest import torch -import torchvision.models as models from executorch.backends.xnnpack.test.tester import Tester +from torchvision import models class TestViT(unittest.TestCase): vit = models.vision_transformer.vit_b_16(weights="IMAGENET1K_V1") vit = vit.eval() model_inputs = (torch.ones(1, 3, 224, 224),) + dynamic_shapes = ( + { + 2: torch.export.Dim("height", min=224, max=455), + 3: torch.export.Dim("width", min=224, max=455), + }, + ) + + class DynamicViT(torch.nn.Module): + def __init__(self): + super().__init__() + self.vit = models.vision_transformer.vit_b_16(weights="IMAGENET1K_V1") + self.vit = self.vit.eval() + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, + size=(224, 224), + mode="bilinear", + align_corners=True, + antialias=False, + ) + return self.vit(x) + all_operators = { "executorch_exir_dialects_edge__ops_aten_expand_copy_default", "executorch_exir_dialects_edge__ops_aten_cat_default", @@ -34,7 +57,8 @@ class TestViT(unittest.TestCase): "executorch_exir_dialects_edge__ops_aten_bmm_default", } - def test_fp32_vit(self): + def _test_exported_vit(self, tester, check_nots=None): + check_nots = check_nots or [] lowerable_xnn_operators = self.all_operators - { "executorch_exir_dialects_edge__ops_aten_expand_copy_default", "executorch_exir_dialects_edge__ops_aten_gelu_default", @@ -48,15 +72,33 @@ def test_fp32_vit(self): "executorch_exir_dialects_edge__ops_aten_bmm_default", } ( - Tester(self.vit, self.model_inputs) - .export() + tester.export() .to_edge() .check(list(self.all_operators)) .partition() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(lowerable_xnn_operators)) + .check_not(check_nots) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() + ) + + def test_fp32_vit(self): + self._test_exported_vit(Tester(self.vit, self.model_inputs)) + + def test_dynamic_vit(self): + bilinear_ops = { + 
"executorch_exir_dialects_edge__ops_aten_sub_Tensor", + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_index_Tensor", + "executorch_exir_dialects_edge__ops_aten_arange_start_step", + "executorch_exir_dialects_edge__ops_aten__to_copy_default", + "executorch_exir_dialects_edge__ops_aten_add_Tensor", + "executorch_exir_dialects_edge__ops_aten_clamp_default", + } + + self._test_exported_vit( + Tester(self.DynamicViT(), self.model_inputs, self.dynamic_shapes), + bilinear_ops, ) diff --git a/backends/xnnpack/test/models/very_big_model.py b/backends/xnnpack/test/models/very_big_model.py index 2200b50a6b2..f3f06380414 100644 --- a/backends/xnnpack/test/models/very_big_model.py +++ b/backends/xnnpack/test/models/very_big_model.py @@ -39,6 +39,5 @@ def test_very_big_model(self): .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/models/w2l.py b/backends/xnnpack/test/models/w2l.py index 10d7ca15b08..7f63d0b15f1 100644 --- a/backends/xnnpack/test/models/w2l.py +++ b/backends/xnnpack/test/models/w2l.py @@ -15,13 +15,15 @@ class TestW2L(unittest.TestCase): batch_size = 10 input_frames = 700 vocab_size = 4096 + num_features = 1 wav2letter = models.Wav2Letter(num_classes=vocab_size).eval() - model_inputs = (torch.randn(batch_size, 1, input_frames),) + model_inputs = (torch.randn(batch_size, num_features, input_frames),) + dynamic_shape = ({0: torch.export.Dim("batch", min=2, max=10)},) def test_fp32_w2l(self): ( - Tester(self.wav2letter, self.model_inputs) + Tester(self.wav2letter, self.model_inputs, self.dynamic_shape) .export() .to_edge() .partition() @@ -34,13 +36,12 @@ def test_fp32_w2l(self): .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=10) ) def test_qs8_w2l(self): ( - Tester(self.wav2letter.eval(), self.model_inputs) + Tester(self.wav2letter.eval(), self.model_inputs, self.dynamic_shape) .quantize() .export() .to_edge() @@ -54,6 +55,5 @@ def test_qs8_w2l(self): .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs(num_runs=10) ) diff --git a/backends/xnnpack/test/ops/abs.py b/backends/xnnpack/test/ops/abs.py index c71fe5ab4e0..2906654dfb7 100644 --- a/backends/xnnpack/test/ops/abs.py +++ b/backends/xnnpack/test/ops/abs.py @@ -31,8 +31,7 @@ def _test_abs(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_abs_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_abs(self): diff --git a/backends/xnnpack/test/ops/add.py b/backends/xnnpack/test/ops/add.py index 3a56e0f4c6a..8b0d0c6234d 100644 --- a/backends/xnnpack/test/ops/add.py +++ b/backends/xnnpack/test/ops/add.py @@ -54,8 +54,7 @@ def _test_add(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_add(self): @@ -79,8 +78,7 @@ def test_fp32_add_constant(self): .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_add_constant(self): @@ -121,8 +119,7 @@ def 
test_qs8_add(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_add2(self): @@ -145,8 +142,7 @@ def test_qs8_add2(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_add3(self): @@ -169,8 +165,7 @@ def test_qs8_add3(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class AddRelu(torch.nn.Module): @@ -194,8 +189,7 @@ def test_fp32_add_relu(self): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_add_relu(self): @@ -214,8 +208,7 @@ def test_qs8_add_relu(self): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_add_relu_seq(self): @@ -261,6 +254,5 @@ def forward(self, x, z): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/avgpool2d.py b/backends/xnnpack/test/ops/avgpool2d.py index 2dd46932988..edb92d09a35 100644 --- a/backends/xnnpack/test/ops/avgpool2d.py +++ b/backends/xnnpack/test/ops/avgpool2d.py @@ -42,8 +42,7 @@ def _test_argpool2d(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_avgpool2d(self): diff --git a/backends/xnnpack/test/ops/bilinear2d.py b/backends/xnnpack/test/ops/bilinear2d.py index 2e80eaf2bc5..ab9d3d3c11d 100644 --- a/backends/xnnpack/test/ops/bilinear2d.py +++ b/backends/xnnpack/test/ops/bilinear2d.py @@ -87,8 +87,7 @@ def test_fp32_static_resize_bilinear2d(self): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp32_static_resize_bilinear2d_with_align_cornesr(self): @@ -103,8 +102,7 @@ def test_fp32_static_resize_bilinear2d_with_align_cornesr(self): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp32_static_resize_bilinear2d_antialiased(self): diff --git a/backends/xnnpack/test/ops/cat.py b/backends/xnnpack/test/ops/cat.py index 8cb9b760b0d..85c5b51a2c7 100644 --- a/backends/xnnpack/test/ops/cat.py +++ b/backends/xnnpack/test/ops/cat.py @@ -11,16 +11,31 @@ class TestCat(unittest.TestCase): - class Cat(torch.nn.Module): - def forward(self, xs): + class Cat2(torch.nn.Module): + def forward(self, arg1, arg2): + xs = [arg1, arg2] x = torch.cat(xs) return x + x # Quantize by propagation. - class Cat2(torch.nn.Module): - def forward(self, xs): - return torch.cat(xs) + class Cat3(torch.nn.Module): + def forward(self, arg1, arg2, arg3): + xs = [arg1, arg2, arg3] + x = torch.cat(xs) + return x + x # Quantize by propagation. + + class Cat4(torch.nn.Module): + def forward(self, arg1, arg2, arg3, arg4): + xs = [arg1, arg2, arg3, arg4] + x = torch.cat(xs) + return x + x # Quantize by propagation. 
- def _test_cat(self, module, inputs, quant=False, quant_ops=2): + class Cat5(torch.nn.Module): + def forward(self, arg1, arg2, arg3, arg4, arg5): + xs = [arg1, arg2, arg3, arg4, arg5] + x = torch.cat(xs) + return x + x # Quantize by propagation. + + def _test_cat(self, module, inputs, cat_num=1, quant=False, quant_ops=2): tester = Tester(module, inputs) if quant: @@ -36,7 +51,7 @@ def _test_cat(self, module, inputs, quant=False, quant_ops=2): # Q/DQ pair for each input and quantized op. For most tests, there are # two quantized ops - cat and add. torch.ops.quantized_decomposed.quantize_per_tensor.default: ( - len(inputs[0]) + quant_ops + cat_num + quant_ops ) } ) @@ -55,8 +70,7 @@ def _test_cat(self, module, inputs, quant=False, quant_ops=2): .check_not(["executorch_exir_dialects_edge__ops_aten_cat"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_cat2(self): @@ -64,10 +78,8 @@ def test_fp16_cat2(self): Using Clamp2 because fp16 add is done in fp32 ATM. Need to fix that first. """ inputs = ( - ( - torch.ones(1, 2, 3).to(torch.float16), - torch.ones(3, 2, 3).to(torch.float16), - ), + torch.ones(1, 2, 3).to(torch.float16), + torch.ones(3, 2, 3).to(torch.float16), ) self._test_cat(self.Cat2(), inputs) @@ -76,81 +88,71 @@ def test_fp16_cat3(self): Using Clamp2 because fp16 add is done in fp32 ATM. Need to fix that first. """ inputs = ( - ( - torch.ones(1, 2, 3).to(torch.float16), - torch.ones(3, 2, 3).to(torch.float16), - torch.ones(2, 2, 3).to(torch.float16), - ), + torch.ones(1, 2, 3).to(torch.float16), + torch.ones(3, 2, 3).to(torch.float16), + torch.ones(2, 2, 3).to(torch.float16), ) - self._test_cat(self.Cat2(), inputs) + self._test_cat(self.Cat3(), inputs) def test_fp16_cat4(self): """ Using Clamp2 because fp16 add is done in fp32 ATM. Need to fix that first. 
""" inputs = ( - ( - torch.ones(1, 2, 3).to(torch.float16), - torch.ones(3, 2, 3).to(torch.float16), - torch.ones(2, 2, 3).to(torch.float16), - torch.ones(5, 2, 3).to(torch.float16), - ), + torch.ones(1, 2, 3).to(torch.float16), + torch.ones(3, 2, 3).to(torch.float16), + torch.ones(2, 2, 3).to(torch.float16), + torch.ones(5, 2, 3).to(torch.float16), ) - self._test_cat(self.Cat2(), inputs) + self._test_cat(self.Cat4(), inputs) def test_fp32_cat2(self): - inputs = ((torch.ones(1, 2, 3), torch.ones(3, 2, 3)),) - self._test_cat(self.Cat(), inputs) + inputs = (torch.ones(1, 2, 3), torch.ones(3, 2, 3)) + self._test_cat(self.Cat2(), inputs) def test_fp32_cat3(self): - inputs = ((torch.ones(1, 2, 3), torch.ones(3, 2, 3), torch.ones(2, 2, 3)),) - self._test_cat(self.Cat(), inputs) + inputs = (torch.ones(1, 2, 3), torch.ones(3, 2, 3), torch.ones(2, 2, 3)) + self._test_cat(self.Cat3(), inputs) def test_fp32_cat4(self): inputs = ( - ( - torch.ones(1, 2, 3), - torch.ones(3, 2, 3), - torch.ones(2, 2, 3), - torch.ones(5, 2, 3), - ), + torch.ones(1, 2, 3), + torch.ones(3, 2, 3), + torch.ones(2, 2, 3), + torch.ones(5, 2, 3), ) - self._test_cat(self.Cat(), inputs) + self._test_cat(self.Cat4(), inputs) def test_qs8_cat2(self): - inputs = ((torch.ones(1, 2, 3), torch.ones(3, 2, 3)),) - self._test_cat(self.Cat(), inputs, quant=True) + inputs = (torch.ones(1, 2, 3), torch.ones(3, 2, 3)) + self._test_cat(self.Cat2(), inputs, cat_num=2, quant=True) def test_qs8_cat3(self): - inputs = ((torch.ones(1, 2, 3), torch.ones(3, 2, 3), torch.ones(2, 2, 3)),) - self._test_cat(self.Cat(), inputs, quant=True) + inputs = (torch.ones(1, 2, 3), torch.ones(3, 2, 3), torch.ones(2, 2, 3)) + self._test_cat(self.Cat3(), inputs, cat_num=3, quant=True) def test_qs8_cat4(self): inputs = ( - ( - torch.ones(1, 2, 3), - torch.ones(3, 2, 3), - torch.ones(2, 2, 3), - torch.ones(5, 2, 3), - ), + torch.ones(1, 2, 3), + torch.ones(3, 2, 3), + torch.ones(2, 2, 3), + torch.ones(5, 2, 3), ) - self._test_cat(self.Cat(), inputs, quant=True) + self._test_cat(self.Cat4(), inputs, cat_num=4, quant=True) def test_fp32_cat_unsupported(self): """ XNNPACK only supports concatenating up to 4 values, so it should not delegate here. 
""" inputs = ( - ( - torch.ones(1, 2, 3), - torch.ones(3, 2, 3), - torch.ones(2, 2, 3), - torch.ones(5, 2, 3), - torch.ones(1, 2, 3), - ), + torch.ones(1, 2, 3), + torch.ones(3, 2, 3), + torch.ones(2, 2, 3), + torch.ones(5, 2, 3), + torch.ones(1, 2, 3), ) ( - Tester(self.Cat(), inputs) + Tester(self.Cat5(), inputs) .export() .check_count({"torch.ops.aten.cat": 1}) .to_edge() diff --git a/backends/xnnpack/test/ops/ceil.py b/backends/xnnpack/test/ops/ceil.py index 853de03ff1d..8d59f3b35d7 100644 --- a/backends/xnnpack/test/ops/ceil.py +++ b/backends/xnnpack/test/ops/ceil.py @@ -31,8 +31,7 @@ def _test_ceil(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_ceil_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_ceil(self): diff --git a/backends/xnnpack/test/ops/clamp.py b/backends/xnnpack/test/ops/clamp.py index 6ffaed3fe1b..c52fd011f8b 100644 --- a/backends/xnnpack/test/ops/clamp.py +++ b/backends/xnnpack/test/ops/clamp.py @@ -33,8 +33,7 @@ def _test_clamp(self, module, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_clamp(self): @@ -77,6 +76,5 @@ def test_qs8_clamp(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/conv1d.py b/backends/xnnpack/test/ops/conv1d.py index 604e37c724c..6558fd673ff 100644 --- a/backends/xnnpack/test/ops/conv1d.py +++ b/backends/xnnpack/test/ops/conv1d.py @@ -81,9 +81,15 @@ def forward(self, x): z = torch.add(y, z) return z - def _test_conv1d(self, module, inputs, conv_count, quantized=False): + def _test_conv1d( + self, module, inputs, conv_count, quantized=False, dynamic_shape=None + ): ( - (Tester(module, inputs).quantize() if quantized else Tester(module, inputs)) + ( + Tester(module, inputs, dynamic_shape).quantize() + if quantized + else Tester(module, inputs) + ) .export() .check_count({"torch.ops.aten.convolution.default": conv_count}) .to_edge() @@ -97,26 +103,45 @@ def _test_conv1d(self, module, inputs, conv_count, quantized=False): .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_conv1d(self): - inputs = (torch.randn(1, 2, 4).to(torch.float16),) - self._test_conv1d(self.Conv1d(dtype=torch.float16), inputs, conv_count=1) + inputs = (torch.randn(2, 2, 4).to(torch.float16),) + dynamic_shapes = ({0: torch.export.Dim("batch", min=2, max=10)},) + self._test_conv1d( + self.Conv1d(dtype=torch.float16), + inputs, + conv_count=1, + dynamic_shape=dynamic_shapes, + ) def test_fp32_conv1d(self): - inputs = (torch.randn(1, 2, 4),) - self._test_conv1d(self.Conv1d(), inputs, 1) + inputs = (torch.randn(2, 2, 4),) + dynamic_shapes = ({0: torch.export.Dim("batch", min=2, max=10)},) + self._test_conv1d(self.Conv1d(), inputs, 1, dynamic_shape=dynamic_shapes) def test_fp32_conv1d_batchnorm_seq(self): - inputs = (torch.randn(1, 2, 4),) - self._test_conv1d(self.Conv1dBatchNormSequential(), inputs, 2) + inputs = (torch.randn(2, 2, 4),) + dynamic_shapes = ({0: torch.export.Dim("batch", min=2, max=10)},) + self._test_conv1d( + self.Conv1dBatchNormSequential(), inputs, 2, dynamic_shape=dynamic_shapes + ) def test_qs8_conv1d(self): - inputs = (torch.randn(1, 2, 4),) - self._test_conv1d(self.Conv1d(), 
inputs, 1, quantized=True) + inputs = (torch.randn(2, 2, 4),) + dynamic_shapes = ({0: torch.export.Dim("batch", min=2, max=10)},) + self._test_conv1d( + self.Conv1d(), inputs, 1, quantized=True, dynamic_shape=dynamic_shapes + ) def test_qs8_conv1d_batchnorm_seq(self): - inputs = (torch.randn(1, 2, 4),) - self._test_conv1d(self.Conv1dBatchNormSequential(), inputs, 2, quantized=True) + inputs = (torch.randn(2, 2, 4),) + dynamic_shapes = ({0: torch.export.Dim("batch", min=2, max=10)},) + self._test_conv1d( + self.Conv1dBatchNormSequential(), + inputs, + 2, + quantized=True, + dynamic_shape=dynamic_shapes, + ) diff --git a/backends/xnnpack/test/ops/conv2d.py b/backends/xnnpack/test/ops/conv2d.py index 3eb80072a68..9a2bb25dc8d 100644 --- a/backends/xnnpack/test/ops/conv2d.py +++ b/backends/xnnpack/test/ops/conv2d.py @@ -152,8 +152,7 @@ def _test( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() - .run_method() - .compare_outputs(qtol=1) + .run_method_and_compare_outputs(qtol=1) ) def test_fp16_conv2d(self) -> None: diff --git a/backends/xnnpack/test/ops/div.py b/backends/xnnpack/test/ops/div.py index 007122db981..2882c59b875 100644 --- a/backends/xnnpack/test/ops/div.py +++ b/backends/xnnpack/test/ops/div.py @@ -39,8 +39,7 @@ def _test_div(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_div_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_div(self): @@ -64,6 +63,5 @@ def test_fp32_div_single_input(self): .check_not(["executorch_exir_dialects_edge__ops_aten_div_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/elu.py b/backends/xnnpack/test/ops/elu.py index f1f8d7628a6..89fef6f9d4b 100644 --- a/backends/xnnpack/test/ops/elu.py +++ b/backends/xnnpack/test/ops/elu.py @@ -39,8 +39,7 @@ def _test_elu(self, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171810227 - Missing recomposition for ELU") @@ -74,8 +73,7 @@ def test_qs8_elu(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171810227 - Missing recomposition for ELU") @@ -99,6 +97,5 @@ def test_qs8_elu_functional(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/floor.py b/backends/xnnpack/test/ops/floor.py index 31c3da09b42..cb65ca2aa58 100644 --- a/backends/xnnpack/test/ops/floor.py +++ b/backends/xnnpack/test/ops/floor.py @@ -31,8 +31,7 @@ def _test_floor(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_floor_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_floor(self): diff --git a/backends/xnnpack/test/ops/hardswish.py b/backends/xnnpack/test/ops/hardswish.py index d35e7ab5d78..8f6a190412c 100644 --- a/backends/xnnpack/test/ops/hardswish.py +++ b/backends/xnnpack/test/ops/hardswish.py @@ -41,8 +41,7 @@ def _test_hardswish(self, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T158969708 - Missing recomposition pass for hardswish") @@ -75,6 +74,5 @@ def test_fp32_hardswish_functional(self): ) .to_executorch() .serialize() - .run_method() - 
.compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/hardtanh.py b/backends/xnnpack/test/ops/hardtanh.py index fdcfb7c7efe..d13624663ca 100644 --- a/backends/xnnpack/test/ops/hardtanh.py +++ b/backends/xnnpack/test/ops/hardtanh.py @@ -38,8 +38,7 @@ def test_fp32_hardtanh(self): .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp32_hardtanh_bound(self): @@ -58,8 +57,7 @@ def test_fp32_hardtanh_bound(self): .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_hardtanh(self): @@ -90,6 +88,5 @@ def test_qs8_hardtanh(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/leaky_relu.py b/backends/xnnpack/test/ops/leaky_relu.py index 477188ed752..ae5f2e3197e 100644 --- a/backends/xnnpack/test/ops/leaky_relu.py +++ b/backends/xnnpack/test/ops/leaky_relu.py @@ -43,8 +43,7 @@ def _test_leaky_relu(self, module, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_leaky_relu(self): @@ -76,8 +75,7 @@ def test_fp32_leaky_relu_functional(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T172863987 - Missing quantizer support.") @@ -107,8 +105,7 @@ def test_qs8_leaky_relu(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T172863987 - Missing quantizer support.") @@ -143,6 +140,5 @@ def test_qs8_leaky_relu_default_slope(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index b4a9cb62856..85b760e38ad 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -26,23 +26,27 @@ class TestLinear(unittest.TestCase): def test_fp16_linear(self): for use_bias in (True, False): - self._test_linear( - lambda in_size, out_size: torch.nn.Linear( - in_size, out_size, bias=use_bias # noqa - ), - uses_bias=use_bias, - dtype=torch.float16, - atol=5e-2, - ) + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + num_batch_dims=num_batch_dims, + uses_bias=use_bias, + dtype=torch.float16, + atol=5e-2, + ) def test_fp32_linear(self): for use_bias in (True, False): - self._test_linear( - lambda in_size, out_size: torch.nn.Linear( - in_size, out_size, bias=use_bias # noqa - ), - uses_bias=use_bias, - ) + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + ) def test_fp32_addmm(self): """ @@ -63,24 +67,71 @@ def forward(self, x): uses_bias=True, ) + def test_fp32_linear_fused_relu(self): + class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias): + super().__init__() + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + for use_bias in 
(True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: LinearReluModule( + in_size, + out_size, + use_bias, # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + ) + + def test_qs8_linear_fused_relu(self): + class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias): + super().__init__() + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: LinearReluModule( + in_size, + out_size, + use_bias, # noqa + ), + num_batch_dims=num_batch_dims, + uses_bias=use_bias, + quant=True, + ) + def test_qs8_linear(self): for use_bias in (True, False): - self._test_linear( - lambda in_size, out_size: torch.nn.Linear( - in_size, out_size, bias=use_bias # noqa - ), - uses_bias=use_bias, - ) + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + ) @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.") def test_qd8_per_tensor_linear(self): for uses_bias in (False, True): inputs = (torch.randn(2, 4),) module = torch.nn.Linear(4, 5, bias=uses_bias) + dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) self._test_dqlinear( module, inputs, + dynamic_shapes=dynamic_shapes, is_per_channel=False, uses_bias=uses_bias, ) @@ -93,6 +144,7 @@ def test_qd8_per_channel_linear(self): self._test_dqlinear( module, inputs, + dynamic_shapes=({0: torch.export.Dim("batch", max=100)},), is_per_channel=True, uses_bias=uses_bias, ) @@ -114,7 +166,7 @@ def test_qd8_per_channel_4w_linear(self): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] output_channels = [1, 8, 127] - batches = [1, 2] + batches = [2, 2] use_bias = [False, True] for bs, bias, ipc, opc in product( @@ -129,13 +181,14 @@ def test_qd8_per_channel_4w_linear(self): self._test_dqlinear( module, inputs, + dynamic_shapes=({0: torch.export.Dim("batch", max=100)},), is_per_channel=True, uses_bias=bias, qconfig=qconfig, ) def test_qd8_per_channel_linear_parallel(self): - in_size = 1 + in_size = 2 input_size = 4 output_size = 5 @@ -165,17 +218,39 @@ def forward(self, x, y): torch.rand(in_size, input_size, dtype=torch.float), torch.rand(in_size, input_size, dtype=torch.float), ) + batch_dim = torch.export.Dim("batch", max=100) + dynamic_shapes = ({0: batch_dim}, {0: batch_dim}) self._test_dqlinear( ParallelLinear(), inputs, + dynamic_shapes=dynamic_shapes, linear_count=2, is_per_channel=True, uses_bias=True, ) + def test_qd8_per_channel_linear_with_two_batch(self): + in_size = 2 + input_size = 4 + output_size = 5 + + linear = torch.nn.Linear(input_size, output_size) + inputs = (torch.randn(2, in_size, input_size, dtype=torch.float),) + batch_dim = torch.export.Dim("batch", max=100) + dynamic_shapes = ({0: batch_dim, 1: batch_dim},) + + self._test_dqlinear( + linear, + inputs, + dynamic_shapes=dynamic_shapes, + linear_count=1, + is_per_channel=True, + uses_bias=True, + ) + def test_qd8_per_channel_linear_sequential(self): - in_size = 1 + in_size = 2 input_size = 4 intermediate_size = 5 output_size = 3 @@ -203,17 +278,20 @@ def forward(self, x): return b inputs = (torch.rand(in_size, input_size, dtype=torch.float),) + dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) 
self._test_dqlinear( LinearSequential(), inputs, + dynamic_shapes=dynamic_shapes, linear_count=2, is_per_channel=True, uses_bias=True, + atol=1e-1, ) def test_qd8_per_channel_linear_parellel_and_sequential(self): - in_size = 1 + in_size = 2 input_size = 4 intermediate_size = 5 output_size = 3 @@ -252,50 +330,21 @@ def forward(self, x, y): torch.rand(in_size, input_size, dtype=torch.float), torch.rand(in_size, input_size, dtype=torch.float), ) + dynamic_shapes = ( + {0: torch.export.Dim("batch", max=100)}, + {0: torch.export.Dim("batch2", max=100)}, + ) self._test_dqlinear( - LinearModule(), inputs, linear_count=3, is_per_channel=True, uses_bias=True + LinearModule(), + inputs, + dynamic_shapes=dynamic_shapes, + linear_count=3, + is_per_channel=True, + uses_bias=True, + atol=1e-1, ) - def test_fp32_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - - for use_bias in (True, False): - self._test_linear( - lambda in_size, out_size: LinearReluModule( - in_size, - out_size, - use_bias, # noqa - ), - uses_bias=use_bias, - ) - - def test_qs8_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - - for use_bias in (True, False): - self._test_linear( - lambda in_size, out_size: LinearReluModule( - in_size, - out_size, - use_bias, # noqa - ), - uses_bias=use_bias, - quant=True, - ) - class ManualDQLinear(torch.nn.Module): def __init__( self, @@ -595,8 +644,7 @@ def _test_manual_dq_linear( ) .to_executorch() .serialize() - .run_method() - .compare_outputs(atol=atol, rtol=rtol) + .run_method_and_compare_outputs(atol=atol, rtol=rtol) ) def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): @@ -677,6 +725,7 @@ def _test_linear( self, make_module, uses_bias, + num_batch_dims=1, quant=False, dtype: torch.dtype = torch.float, atol=1e-03, @@ -693,7 +742,7 @@ def _test_linear( ) ) - in_sizes = [1, 4, 4] + in_sizes = [3, 4, 4] input_sizes = [4, 37, 17] output_sizes = [4, 17, 37] @@ -705,11 +754,19 @@ def _test_linear( in_size = int(in_sizes[i]) input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) + input_shape = [in_size] * num_batch_dims + [input_size] + print(f"Testing input_shape {input_shape} with {output_size} out_channels") module = make_module(input_size, output_size).eval().to(dtype) - inputs = (torch.randn(in_size, input_size).to(dtype),) + inputs = (torch.randn(input_shape).to(dtype),) + dynamic_shape = {} + for i in range(num_batch_dims): + dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) + + dynamic_shape = (dynamic_shape,) + print(dynamic_shape) - tester = Tester(module, inputs) + tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) if quant: tester.quantize() @@ -731,18 +788,18 @@ def _test_linear( tester.to_executorch() tester.serialize() - tester.run_method() - tester.compare_outputs(qtol=quant, atol=atol) - print("success") + tester.run_method_and_compare_outputs(qtol=quant, atol=atol) def _test_dqlinear( self, module, inputs, + dynamic_shapes, linear_count=1, is_per_channel=False, uses_bias=False, qconfig: Optional[QuantizationConfig] = None, + atol=5e-02, ): aten_op, 
edge_op = ( ( @@ -761,13 +818,12 @@ def _test_dqlinear( is_dynamic=True, ) - tester = Tester(module, inputs) + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) tester.quantize(Quantize(quantization_config=quant_config)) tester.export() tester.check_count({aten_op: linear_count}) tester.check(["torch.ops.quantized_decomposed"]) - tester.dump_artifact() tester.to_edge() tester.check_count({edge_op: linear_count}) @@ -779,5 +835,4 @@ def _test_dqlinear( tester.to_executorch() tester.serialize() - tester.run_method() - tester.compare_outputs(atol=5e-02) + tester.run_method_and_compare_outputs(atol=atol) diff --git a/backends/xnnpack/test/ops/max_dim.py b/backends/xnnpack/test/ops/max_dim.py index b43d1ce4e82..9cab1236e4c 100644 --- a/backends/xnnpack/test/ops/max_dim.py +++ b/backends/xnnpack/test/ops/max_dim.py @@ -37,8 +37,7 @@ def _test_max_dim(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_max_dim"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171468483 - Fails to partition due to index output dtype.") @@ -65,6 +64,5 @@ def test_fp32_max_dim_no_indices(self): .check_not(["executorch_exir_dialects_edge__ops_aten_max_dim"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/maximum.py b/backends/xnnpack/test/ops/maximum.py index 5ce05d33e37..feff02744d3 100644 --- a/backends/xnnpack/test/ops/maximum.py +++ b/backends/xnnpack/test/ops/maximum.py @@ -30,8 +30,7 @@ def _test_maximum(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_maximum_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_maximum(self): @@ -64,6 +63,5 @@ def test_fp32_maximum_broadcast(self): .check_not(["executorch_exir_dialects_edge__ops_aten_maximum_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/maxpool2d.py b/backends/xnnpack/test/ops/maxpool2d.py index 84c76a6e6c9..7e510dd9155 100644 --- a/backends/xnnpack/test/ops/maxpool2d.py +++ b/backends/xnnpack/test/ops/maxpool2d.py @@ -64,8 +64,7 @@ def _test_maxpool2d(self, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_maxpool2d(self): @@ -135,6 +134,5 @@ def forward(self, x): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/mean_dim.py b/backends/xnnpack/test/ops/mean_dim.py index b8d7e77a224..750b0e8f508 100644 --- a/backends/xnnpack/test/ops/mean_dim.py +++ b/backends/xnnpack/test/ops/mean_dim.py @@ -33,8 +33,7 @@ def _test_mean_dim(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_mean_dim(self): @@ -85,6 +84,5 @@ def test_qs8_mean_dim(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs(qtol=1) + .run_method_and_compare_outputs(qtol=1) ) diff --git a/backends/xnnpack/test/ops/minimum.py b/backends/xnnpack/test/ops/minimum.py index 5d6f08fd1a2..121fbeb1852 100644 --- a/backends/xnnpack/test/ops/minimum.py +++ b/backends/xnnpack/test/ops/minimum.py @@ -30,8 +30,7 @@ def _test_minimum(self, inputs): 
.check_not(["executorch_exir_dialects_edge__ops_aten_minimum_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_minimum(self): diff --git a/backends/xnnpack/test/ops/multiply.py b/backends/xnnpack/test/ops/multiply.py index 09f9b39ea60..d151f58bd6a 100644 --- a/backends/xnnpack/test/ops/multiply.py +++ b/backends/xnnpack/test/ops/multiply.py @@ -43,8 +43,7 @@ def _test_mul(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_mul_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_mul(self): @@ -78,8 +77,7 @@ def test_qs8_mul(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_mul2(self): @@ -102,8 +100,7 @@ def test_qs8_mul2(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_mul_functional(self): @@ -126,8 +123,7 @@ def test_qs8_mul_functional(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_mul_relu(self): @@ -156,6 +152,5 @@ def test_qs8_mul_relu(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/negate.py b/backends/xnnpack/test/ops/negate.py index b7777136f5a..c4a47bb93ce 100644 --- a/backends/xnnpack/test/ops/negate.py +++ b/backends/xnnpack/test/ops/negate.py @@ -31,8 +31,7 @@ def _test_negate(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_neg_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_negate(self): diff --git a/backends/xnnpack/test/ops/permute.py b/backends/xnnpack/test/ops/permute.py index 3441acb6315..2c995376753 100644 --- a/backends/xnnpack/test/ops/permute.py +++ b/backends/xnnpack/test/ops/permute.py @@ -45,8 +45,7 @@ def _test_permute(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_permute_copy_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_permute(self): @@ -72,8 +71,7 @@ def test_fp32_permute_copy(self): .check_not(["executorch_exir_dialects_edge__ops_aten_permute_copy_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_permute(self): @@ -102,8 +100,7 @@ def test_qs8_permute(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_permute_copy(self): @@ -132,6 +129,5 @@ def test_qs8_permute_copy(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/pow.py b/backends/xnnpack/test/ops/pow.py index b4bd6b5862c..d99f2c546e6 100644 --- a/backends/xnnpack/test/ops/pow.py +++ b/backends/xnnpack/test/ops/pow.py @@ -34,8 +34,7 @@ def _test_pow2(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_pow2(self): diff --git a/backends/xnnpack/test/ops/prelu.py b/backends/xnnpack/test/ops/prelu.py index a4e9ef7df95..985ddecf363 100644 --- a/backends/xnnpack/test/ops/prelu.py +++ 
b/backends/xnnpack/test/ops/prelu.py @@ -36,8 +36,7 @@ def _test_prelu(self, module, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T158653285 - Missing recomposition for PReLU") diff --git a/backends/xnnpack/test/ops/quantize_per_tensor.py b/backends/xnnpack/test/ops/quantize_per_tensor.py index 82aaca0b6f7..f912428a8ab 100644 --- a/backends/xnnpack/test/ops/quantize_per_tensor.py +++ b/backends/xnnpack/test/ops/quantize_per_tensor.py @@ -39,8 +39,7 @@ def forward(self, x): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_dequantize_per_tenstor(self): @@ -76,6 +75,5 @@ def forward(self, x): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/relu.py b/backends/xnnpack/test/ops/relu.py index c52055e45f1..3ab1c72b57d 100644 --- a/backends/xnnpack/test/ops/relu.py +++ b/backends/xnnpack/test/ops/relu.py @@ -33,6 +33,5 @@ def test_fp32_relu(self): .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/sdpa.py b/backends/xnnpack/test/ops/sdpa.py index 5cf8534c928..d68bcab2086 100644 --- a/backends/xnnpack/test/ops/sdpa.py +++ b/backends/xnnpack/test/ops/sdpa.py @@ -70,8 +70,7 @@ def _test(self, module, inputs, atol=1e-03, rtol=1e-03): ) .to_executorch() .serialize() - .run_method() - .compare_outputs(atol=atol, rtol=rtol) + .run_method_and_compare_outputs(atol=atol, rtol=rtol) ) def test_fp16_sdpa_mask2d(self): diff --git a/backends/xnnpack/test/ops/sigmoid.py b/backends/xnnpack/test/ops/sigmoid.py index be8eda605ee..5ed6fc64402 100644 --- a/backends/xnnpack/test/ops/sigmoid.py +++ b/backends/xnnpack/test/ops/sigmoid.py @@ -32,8 +32,7 @@ def _test_sigmoid(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_sigmoid(self): diff --git a/backends/xnnpack/test/ops/slice_copy.py b/backends/xnnpack/test/ops/slice_copy.py index 99b5842313f..2d0f150dd15 100644 --- a/backends/xnnpack/test/ops/slice_copy.py +++ b/backends/xnnpack/test/ops/slice_copy.py @@ -27,8 +27,7 @@ def _test_slice_copy(self, module, inputs, copy_count=1, edge_copy_count=1): .check_not(["executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_slice_copy(self): @@ -143,6 +142,5 @@ def forward(self, x): .check_not(["executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/softmax.py b/backends/xnnpack/test/ops/softmax.py index 43ff89f1206..d3f674d7ae2 100644 --- a/backends/xnnpack/test/ops/softmax.py +++ b/backends/xnnpack/test/ops/softmax.py @@ -38,8 +38,7 @@ def _test_softmax(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_softmax(self): diff --git a/backends/xnnpack/test/ops/sqrt.py b/backends/xnnpack/test/ops/sqrt.py index 99ab8f72340..e2a5f4ac2f6 100644 --- 
a/backends/xnnpack/test/ops/sqrt.py +++ b/backends/xnnpack/test/ops/sqrt.py @@ -16,6 +16,7 @@ def __init__(self): super().__init__() def forward(self, x): + x = torch.abs(x) z = torch.sqrt(x) return z @@ -31,14 +32,13 @@ def _test_sqrt(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_sqrt_default"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_sqrt(self): - inputs = (torch.randn(20).to(torch.float16).abs(),) + inputs = (torch.randn(20).to(torch.float16),) self._test_sqrt(inputs) def test_fp32_sqrt(self): - inputs = (torch.randn(20).abs(),) + inputs = (torch.randn(20),) self._test_sqrt(inputs) diff --git a/backends/xnnpack/test/ops/square.py b/backends/xnnpack/test/ops/square.py index faad836becf..02dc12e16e4 100644 --- a/backends/xnnpack/test/ops/square.py +++ b/backends/xnnpack/test/ops/square.py @@ -37,8 +37,7 @@ def _test_square(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_square(self): diff --git a/backends/xnnpack/test/ops/static_constant_pad.py b/backends/xnnpack/test/ops/static_constant_pad.py index 6b8563e291d..c836b404ac7 100644 --- a/backends/xnnpack/test/ops/static_constant_pad.py +++ b/backends/xnnpack/test/ops/static_constant_pad.py @@ -99,8 +99,7 @@ def _test_static_constant_pad_functional(self, inputs): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_static_constant_pad_functional(self): @@ -154,8 +153,7 @@ def forward(self, x): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_static_constant_pad_2d(self): @@ -180,6 +178,5 @@ def test_qs8_static_constant_pad_2d(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/ops/sub.py b/backends/xnnpack/test/ops/sub.py index bcb4f389bd6..d3cc6e8aa80 100644 --- a/backends/xnnpack/test/ops/sub.py +++ b/backends/xnnpack/test/ops/sub.py @@ -39,8 +39,7 @@ def _test_sub(self, inputs): .check_not(["executorch_exir_dialects_edge__ops_aten_sub_Tensor"]) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp16_sub(self): @@ -75,8 +74,7 @@ def test_qs8_sub(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171957656 - Quantized sub not implemented.") @@ -100,8 +98,7 @@ def test_qs8_sub2(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171957656 - Quantized sub not implemented.") @@ -125,8 +122,7 @@ def test_qs8_sub3(self): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) @unittest.skip("T171957656 - Quantized sub not implemented.") @@ -166,6 +162,5 @@ def forward(self, x, y): ) .to_executorch() .serialize() - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/passes/test_batch_norm_fusion.py b/backends/xnnpack/test/passes/test_batch_norm_fusion.py index ab9b02af4bf..06517c526c8 100644 --- a/backends/xnnpack/test/passes/test_batch_norm_fusion.py +++ b/backends/xnnpack/test/passes/test_batch_norm_fusion.py @@ -40,8 +40,7 @@ def 
test_fp32_batch_norm_fusion(self): .to_edge() .run_passes(self.PassStage) .check_count({self.bn_name: 1}) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_q8_batch_norm_fusion(self): @@ -52,8 +51,7 @@ def test_q8_batch_norm_fusion(self): .to_edge() .run_passes(self.PassStage) .check_count({self.bn_name: 1}) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_fp32_batch_norm_no_fusion_doesnt_partition(self): diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index abb18a8c0b2..36e566abc36 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -42,8 +42,7 @@ def test_fp32_channels_last_tagged_reshape_pass(self): self.to_copy_name: num_reshape, } ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_channels_last_tagged_reshape_pass(self): @@ -64,8 +63,7 @@ def test_qs8_channels_last_tagged_reshape_pass(self): ] * num_reshape ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class ConvRelu(torch.nn.Module): @@ -86,8 +84,7 @@ def test_fp32_channels_last_tagged_reshape_pass_conv_relu(self): .check( [self.to_copy_name, self.conv_name, self.relu_name, self.to_copy_name] ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self): @@ -109,8 +106,7 @@ def test_qs8_channels_last_tagged_reshape_pass_conv_relu(self): self.to_copy_name, ] ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class Conv2dBnHardtanhMeanSequenceModule(torch.nn.Module): @@ -175,6 +171,5 @@ def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self): self.to_copy_name: 4, } ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/passes/test_convert_to_linear.py b/backends/xnnpack/test/passes/test_convert_to_linear.py index 783336a01cd..0fa80246fd6 100644 --- a/backends/xnnpack/test/passes/test_convert_to_linear.py +++ b/backends/xnnpack/test/passes/test_convert_to_linear.py @@ -35,6 +35,5 @@ def test_fp32_convert_to_linear(self): .check_count( {"executorch_exir_dialects_edge__ops_aten_linear_default": 1} ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/passes/test_remove_get_item_pass.py b/backends/xnnpack/test/passes/test_remove_get_item_pass.py index 35bd4d8b966..fa68c403e38 100644 --- a/backends/xnnpack/test/passes/test_remove_get_item_pass.py +++ b/backends/xnnpack/test/passes/test_remove_get_item_pass.py @@ -42,8 +42,7 @@ def test_fp32_max_pool2d_remove_getitem(self): .to_edge() .run_passes(self.PassStage) .check_count({self.max_pool2d_name: 1}) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_q8_max_pool2d_remove_getitem(self): @@ -54,8 +53,7 @@ def test_q8_max_pool2d_remove_getitem(self): .to_edge() .run_passes(self.PassStage) .check_count({self.max_pool2d_name: 1}) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) class MaxModule(torch.nn.Module): @@ -79,8 +77,7 @@ def test_fp32_max_remove_getitem(self): self.amax_name: 1, } ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) def test_q8_max_remove_getitem(self): @@ -95,6 +92,5 @@ def test_q8_max_remove_getitem(self): 
self.amax_name: 1, } ) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() ) diff --git a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py index 97c31c3d43a..dc67a6582df 100644 --- a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py @@ -55,8 +55,7 @@ def test_tag_implicit_q_dq_test(self): .export() .to_edge() .run_passes(self.PassStage) - .run_method() - .compare_outputs() + .run_method_and_compare_outputs() .get_artifact(Tester.stage_name(self.PassStage)) ) diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index ec03fa2529d..8812d5e5019 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -7,6 +7,7 @@ import copy import logging +import random import sys from abc import ABC, abstractmethod from collections import Counter, OrderedDict @@ -26,7 +27,7 @@ ) from executorch.exir.backend.backend_api import validation_disabled from executorch.exir.backend.partitioner import Partitioner -from executorch.exir.passes.spec_prop_pass import SpecPropPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.print_program import pretty_print, print_program logger = logging.getLogger(__name__) @@ -177,11 +178,18 @@ def graph_module(self) -> str: @register_stage class Export(Stage): - def __init__(self): + def __init__(self, dynamic_shapes: Optional[Tuple[Any]] = None): self.exported_program = None + self.dynamic_shapes = dynamic_shapes - def run(self, artifact: torch.nn.Module, inputs) -> None: - self.exported_program = export(artifact, inputs) + def run( + self, + artifact: torch.nn.Module, + inputs: Tuple[torch.Tensor], + ) -> None: + self.exported_program = export( + artifact, inputs, dynamic_shapes=self.dynamic_shapes + ) @property def artifact(self) -> ExportedProgram: @@ -261,8 +269,8 @@ def __init__( config: Optional[ExecutorchBackendConfig] = None, ): self.config = config or ExecutorchBackendConfig( - passes=[SpecPropPass()], extract_delegate_segments=True, + sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) self.executorch_program = None @@ -334,11 +342,13 @@ def __init__( self, module: torch.nn.Module, inputs: Tuple[torch.Tensor], + dynamic_shapes: Optional[Tuple[Any]] = None, ): module.eval() self.original_module = module self.inputs = inputs + self.dynamic_shapes = dynamic_shapes self.stages: Dict[str, Stage] = OrderedDict.fromkeys(list(_stages_.keys())) self.pipeline = { self.stage_name(Quantize): [self.stage_name(Export)], @@ -371,6 +381,59 @@ def __init__( # Artifact output from stage self.stage_output = None + def generate_random_inputs(self): + # Get shapes of inputs + input_shapes = [] + if self.dynamic_shapes is None: + for tensor_arg in self.inputs: + assert isinstance(tensor_arg, torch.Tensor) + input_shapes.append(tensor_arg.shape) + else: + # Random shapes depending on dynamic shape constraint + dim_name_to_size = {} + for arg_idx in range(len(self.inputs)): + assert isinstance(self.inputs[arg_idx], torch.Tensor) + ex_shape = list(self.inputs[arg_idx].shape) + dynamic_dim_spec = self.dynamic_shapes[arg_idx] + for dim_idx, dim_spec in dynamic_dim_spec.items(): + assert dim_idx < len(ex_shape) + if isinstance(dim_spec, torch.export.dynamic_shapes._DerivedDim): + # derived dims are of the form {0: 2 * torch.export.Dim() // 2} + # The root contains the min/max of the 
export dim and fn contains + # the function to compute the derived dim. + dim_spec = dim_spec.root + fn = dim_spec.fn + elif isinstance(dim_spec, torch.export.dynamic_shapes._Dim): + # Not derived dim so fn is just itself + def fn(x): + return x + + else: + raise RuntimeError( + f"Expected Dynamic Dims to be of type _DerivedDim or _Dim but got {type(dim_spec)}" + ) + dim_name = dim_spec.__name__ + if dim_name not in dim_name_to_size: + upper_bound = min( + dim_spec.max, 1000 + ) # unbounded int max is too large + lower_bound = ( + dim_spec.min if dim_spec.min != 2 else 1 + ) # 0/1 specialization means dim_spec.min can never be 1 + dim_name_to_size[dim_name] = fn( + random.randint(lower_bound, upper_bound) + ) + ex_shape[dim_idx] = dim_name_to_size[dim_spec.__name__] + input_shapes.append(torch.Size(ex_shape)) + # create random tensor inputs with the shapes given above: + random_inputs = [] + for arg_idx in range(len(self.inputs)): + random_inputs.append( + torch.randn(input_shapes[arg_idx]).to(dtype=self.inputs[arg_idx].dtype) + ) + + yield tuple(random_inputs) + @staticmethod def stage_name(stage) -> str: t = stage if isinstance(stage, type) else type(stage) @@ -406,7 +469,9 @@ def quantize(self, quantize_stage: Optional[Quantize] = None): return self._run_stage(quantize_stage or Quantize(), self.inputs) def export(self, export_stage: Optional[Export] = None): - return self._run_stage(export_stage or Export(), self.inputs) + return self._run_stage( + export_stage or Export(dynamic_shapes=self.dynamic_shapes), self.inputs + ) def to_edge(self, to_edge_stage: Optional[ToEdge] = None): return self._run_stage(to_edge_stage or ToEdge()) @@ -469,21 +534,39 @@ def check_node_count(self, input: Dict[Any, int]): return self - def run_method( - self, stage: Optional[str] = None, inputs: Optional[Tuple[torch.Tensor]] = None + def run_method_and_compare_outputs( + self, + stage: Optional[str] = None, + inputs: Optional[Tuple[torch.Tensor]] = None, + num_runs=1, + atol=1e-03, + rtol=1e-03, + qtol=0, ): - inputs_to_run = inputs or self.inputs - export_stage = self.stages[self.stage_name(Export)] - - # Reference output (and quantization scale) - ( - self.reference_output, - self.quantization_scale, - ) = self._calculate_reference_output(export_stage.artifact, inputs_to_run) + number_of_runs = 1 if inputs is not None else num_runs + reference_stage = self.stages[self.stage_name(Export)] - # Output from running artifact at stage stage = stage or self.cur - self.stage_output = self.stages[stage].run_artifact(inputs_to_run) + + print(f"Comparing Stage {stage} with Stage {reference_stage}") + for run_iteration in range(number_of_runs): + inputs_to_run = inputs if inputs else next(self.generate_random_inputs()) + input_shapes = [generated_input.shape for generated_input in inputs_to_run] + print(f"Run {run_iteration} with input shapes: {input_shapes}") + + # Reference output (and quantization scale) + ( + reference_output, + quantization_scale, + ) = self._calculate_reference_output( + reference_stage.artifact, inputs_to_run + ) + + # Output from running artifact at stage + stage_output = self.stages[stage].run_artifact(inputs_to_run) + self._compare_outputs( + reference_output, stage_output, quantization_scale, atol, rtol, qtol + ) return self @@ -512,7 +595,7 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03): f"Output {i} does not match reference output.\n" f"\tGiven atol: {atol}, rtol: {rtol}.\n" f"\tOutput tensor shape: {model.shape}, dtype: {model.dtype}\n" - f"\tDifference: 
max: {torch.max(model-ref)}, abs: {torch.max(torch.abs(model-ref))}.\n" + f"\tDifference: max: {torch.max(model-ref)}, abs: {torch.max(torch.abs(model-ref))}, mean abs error: {torch.mean(torch.abs(model-ref))}.\n" f"\t-- Model vs. Reference --\n" f"\t Numel: {model.numel()}, {ref.numel()}\n" f"\tMedian: {model.median()}, {ref.median()}\n" @@ -521,33 +604,37 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03): f"\t Min: {model.min()}, {ref.min()}\n" ) - def compare_outputs(self, atol=1e-03, rtol=1e-03, qtol=0): + @staticmethod + def _compare_outputs( + reference_output, + stage_output, + quantization_scale=None, + atol=1e-03, + rtol=1e-03, + qtol=0, + ): """ Compares the original of the original nn module with the output of the generated artifact. This requres calling run_method before calling compare_outputs. As that runs the generated artifact on the sample inputs and sets the stage output to be compared against the reference. """ - assert self.reference_output is not None - assert self.stage_output is not None - # Wrap both outputs as tuple, since executor output is always a tuple even if single tensor - if isinstance(self.reference_output, torch.Tensor): - self.reference_output = (self.reference_output,) - if isinstance(self.stage_output, torch.Tensor): - self.stage_output = (self.stage_output,) + if isinstance(reference_output, torch.Tensor): + reference_output = (reference_output,) + if isinstance(stage_output, torch.Tensor): + stage_output = (stage_output,) # If a qtol is provided and we found an dequantization node prior to the output, relax the # atol by qtol quant units. - if self.quantization_scale is not None: - atol += self.quantization_scale * qtol + if quantization_scale is not None: + atol += quantization_scale * qtol - self._assert_outputs_equal( - self.stage_output, - self.reference_output, + Tester._assert_outputs_equal( + stage_output, + reference_output, atol=atol, rtol=rtol, ) - return self @staticmethod def _calculate_reference_output( diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 78425485526..0b6adae0a7f 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -25,9 +25,9 @@ PORTABLE=OFF QUANTIZED=OFF XNNPACK=OFF HEADERS_PATH="include" -EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" +EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" COREML_FRAMEWORK="coreml_backend:libcoremldelegate.a:" -CUSTOM_FRAMEWORK="custom_backend:libcustom_ops.a,libcustom_ops_lib.a:" +CUSTOM_FRAMEWORK="custom_backend:libcustom_ops.a:" MPS_FRAMEWORK="mps_backend:libmpsdelegate.a:" OPTIMIZED_FRAMEWORK="optimized_backend:liboptimized_kernels.a,liboptimized_ops_lib.a:" PORTABLE_FRAMEWORK="portable_backend:libportable_kernels.a,libportable_ops_lib.a:" diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4346881224b..91174c08f75 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -19,6 +19,18 @@ excludes = [ buck_targets = [ "//runtime/executor:program", ] +deps = [ + "executorch_no_prim_ops", +] +filters = [ + ".cpp$", +] + + +[targets.executorch_no_prim_ops] +buck_targets = [ + "//runtime/executor:program_no_prim_ops", +] deps = [ "program_schema", ] @@ -43,6 +55,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.optimized_kernels] @@ -59,6 +72,7 @@ 
excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -76,6 +90,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -97,6 +112,7 @@ filters = [ excludes = [ ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -113,6 +129,7 @@ filters = [ ".cpp$", ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -125,6 +142,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "extension_data_loader", ] @@ -137,6 +155,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] # ---------------------------------- extension end ---------------------------------- @@ -154,6 +173,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", "quantized_kernels", ] @@ -169,6 +189,7 @@ excludes = [ "^codegen", ] deps = [ + "executorch_no_prim_ops", "executorch", ] # ---------------------------------- binary end ---------------------------------- @@ -185,6 +206,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -197,6 +219,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.mps_schema] @@ -222,6 +245,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "xnnpack_backend", "portable_kernels", ] @@ -235,6 +259,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.xnnpack_dynamic_quant_utils] @@ -275,6 +300,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "optimized_kernels", "xnnpack_backend", ] @@ -292,6 +318,7 @@ excludes = [ deps = [ "custom_ops", "executorch", + "executorch_no_prim_ops", "extension_data_loader", "extension_module", "portable_kernels", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 14ec7bf1f45..60c8ebda5e6 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -13,27 +13,20 @@ cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") -add_library(executorch STATIC IMPORTED) -find_library( - EXECUTORCH_LIBRARY_PATH executorch - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - executorch PROPERTIES IMPORTED_LOCATION "${EXECUTORCH_LIBRARY_PATH}" -) -target_include_directories(executorch INTERFACE ${_root}) +set(required_lib_list executorch executorch_no_prim_ops portable_kernels) +foreach(lib ${required_lib_list}) + set(lib_var "LIB_${lib}") + add_library(${lib} STATIC IMPORTED) + find_library( + ${lib_var} ${lib} HINTS "${_root}" CMAKE_FIND_ROOT_PATH_BOTH + ) + set_target_properties( + ${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}" + ) + target_include_directories(${lib} INTERFACE ${_root}) +endforeach() -add_library(portable_kernels STATIC IMPORTED) -find_library( - PORTABLE_KERNELS_PATH portable_kernels - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - portable_kernels PROPERTIES IMPORTED_LOCATION "${PORTABLE_KERNELS_PATH}" -) -target_include_directories(portable_kernels INTERFACE ${_root}) +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) if(CMAKE_BUILD_TYPE MATCHES "Debug") set(FLATCCRT_LIB flatccrt_d) @@ -45,7 +38,7 @@ set(lib_list etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas - optimized_ops_lib optimized_native_cpu_ops_lib + optimized_ops_lib optimized_native_cpu_ops_lib 
quantized_kernels quantized_ops_lib ) foreach(lib ${lib_list}) # Name of the variable which stores result of the find_library search diff --git a/build/resolve_buck.py b/build/resolve_buck.py index cba151ab340..463e6bf6c37 100644 --- a/build/resolve_buck.py +++ b/build/resolve_buck.py @@ -76,6 +76,10 @@ class BuckInfo: archive_name="buck2-aarch64-apple-darwin.zst", target_versions=["99e407b49dc432eda0cbddd67ea78346"], ), + ("darwin", "x86_64"): BuckInfo( + archive_name="buck2-x86_64-apple-darwin.zst", + target_versions=["9150d78e7a7531799a1b06ce58623bbc"], + ), } diff --git a/build/test_android_ci.sh b/build/test_android_ci.sh index acc853727fa..2b019a1cd6c 100755 --- a/build/test_android_ci.sh +++ b/build/test_android_ci.sh @@ -32,6 +32,7 @@ build_android_llama_demo_app() { pushd examples/demo-apps/android/LlamaDemo ANDROID_NDK=/opt/ndk ANDROID_ABI=arm64-v8a ./gradlew setup ANDROID_HOME=/opt/android/sdk ./gradlew build + ANDROID_HOME=/opt/android/sdk ./gradlew assembleAndroidTest popd } diff --git a/docs/README.md b/docs/README.md index 1360d54aa99..5d0fae3de5c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -57,7 +57,11 @@ To build the documentation locally: ```bash pip3 install -r ./.ci/docker/requirements-ci.txt ``` +1. Update submodules + ```bash + git submodule sync && git submodule update --init + ``` 1. Run: ```bash diff --git a/docs/source/_static/img/llama_ios_app.mp4 b/docs/source/_static/img/llama_ios_app.mp4 new file mode 100644 index 00000000000..fead47644d6 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.mp4 differ diff --git a/docs/source/_static/img/llama_ios_app.png b/docs/source/_static/img/llama_ios_app.png new file mode 100644 index 00000000000..4f9020efb87 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.png differ diff --git a/docs/source/_static/img/llm_manual_print_data_tabular.png b/docs/source/_static/img/llm_manual_print_data_tabular.png new file mode 100644 index 00000000000..6052a404246 Binary files /dev/null and b/docs/source/_static/img/llm_manual_print_data_tabular.png differ diff --git a/docs/source/_static/img/print_data_tabular.png b/docs/source/_static/img/print_data_tabular.png index 593ea4088ca..7e20b129bb4 100644 Binary files a/docs/source/_static/img/print_data_tabular.png and b/docs/source/_static/img/print_data_tabular.png differ diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index c442b2cc6b8..da830e542c8 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -1,6 +1,6 @@ # Building and Running ExecuTorch with Core ML Backend -Core ML delegate uses Core ML apis to enable running neural networks via Apple's hardware acceleration. For more about coreml you can read [here](https://developer.apple.com/documentation/coreml). In this tutorial we will walk through steps of lowering a PyTorch model to Core ML delegate +Core ML delegate uses Core ML APIs to enable running neural networks via Apple's hardware acceleration. For more about Core ML you can read [here](https://developer.apple.com/documentation/coreml). In this tutorial, we will walk through the steps of lowering a PyTorch model to Core ML delegate ::::{grid} 2 @@ -24,8 +24,8 @@ Core ML delegate uses Core ML apis to enable running neural networks via Apple's In order to be able to successfully build and run the ExecuTorch's Core ML backend you'll need the following hardware and software components. ### Hardware: -- A [mac](https://www.apple.com/mac/]) system for building. 
-- A [mac](https://www.apple.com/mac/]) or [iPhone](https://www.apple.com/iphone/) or [iPad](https://www.apple.com/ipad/) or [Apple TV](https://www.apple.com/tv-home/) device for running the model. +- A [mac](https://www.apple.com/mac/) system for building. +- A [mac](https://www.apple.com/mac/) or [iPhone](https://www.apple.com/iphone/) or [iPad](https://www.apple.com/ipad/) or [Apple TV](https://www.apple.com/tv-home/) device for running the model. ### Software: @@ -67,22 +67,53 @@ python3 -m examples.apple.coreml.scripts.export --model_name mv3 ### Runtime: -**Running the Core ML delegated Program**: +**Running a Core ML delegated Program**: 1. Build the runner. ```bash cd executorch -# Generates ./coreml_executor_runner. +# Builds `coreml_executor_runner`. ./examples/apple/coreml/scripts/build_executor_runner.sh ``` -2. Run the exported program. +2. Run the CoreML delegated program. ```bash cd executorch -# Runs the exported mv3 model on the Core ML backend. +# Runs the exported mv3 model using the Core ML backend. ./coreml_executor_runner --model_path mv3_coreml_all.pte ``` +**Profiling a Core ML delegated Program**: + +Note that profiling is supported on [macOS](https://developer.apple.com/macos) >= 14.4. + +1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) when exporting your model. +```bash +cd executorch + +# Generates `mv3_coreml_all.pte` and `mv3_coreml_etrecord.bin` files. +python3 -m examples.apple.coreml.scripts.export --model_name mv3 --generate_etrecord +``` + +2. Build the runner. +```bash +# Builds `coreml_executor_runner`. +./examples/apple/coreml/scripts/build_executor_runner.sh +``` +3. Run and generate an [ETDump](./sdk-etdump.md). +```bash +cd executorch + +# Generate the ETDump file. +./coreml_executor_runner --model_path mv3_coreml_all.pte --profile_model --etdump_path etdump.etdp +``` + +4. Create an instance of the [Inspector API](./sdk-inspector.rst) by passing in the [ETDump](./sdk-etdump.md) you have sourced from the runtime along with the optionally generated [ETRecord](./sdk-etrecord.rst) from step 1 or execute the following command in your terminal to display the profiling data table. +```bash +python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp --etrecord_path mv3_coreml.bin +``` + + ## Deploying and running on a device **Running the Core ML delegated Program in the Demo iOS App**: @@ -92,27 +123,27 @@ cd executorch 3. Complete the [Final Steps](demo-apps-ios.md#final-steps) section of the tutorial to build and run the demo app. -
**Running the Core ML delegated Program in your own App** -1. Build **Core ML** delegate. The following will create a `executorch.xcframework` in the `cmake-out` directory. +
**Running the Core ML delegated Program in your App** +1. Build frameworks, running the following will create a `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch ./build/build_apple_frameworks.sh --Release --coreml ``` 2. Create a new [Xcode project](https://developer.apple.com/documentation/xcode/creating-an-xcode-project-for-an-app#) or open an existing project. -3. Drag the `executorch.xcframework` generated from Step 2 to Frameworks. +3. Drag the `executorch.xcframework` and `coreml_backend.xcframework` generated from Step 2 to Frameworks. 4. Go to the project's [Build Phases](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target) - Link Binaries With Libraries, click the + sign, and add the following frameworks: ``` -- executorch.xcframework -- coreml_backend.xcframework -- Accelerate.framework -- CoreML.framework -- libsqlite3.tbd +executorch.xcframework +coreml_backend.xcframework +Accelerate.framework +CoreML.framework +libsqlite3.tbd ``` 5. Add the exported program to the [Copy Bundle Phase](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target#Copy-files-to-the-finished-product) of your Xcode target. -6. Please follow the [running a model](running-a-model-cpp-tutorial.md) tutorial to integrate the code for loading a ExecuTorch program. +6. Please follow the [running a model](./running-a-model-cpp-tutorial.md) tutorial to integrate the code for loading an ExecuTorch program. 7. Update the code to load the program from the Application's bundle. ``` objective-c @@ -120,9 +151,7 @@ using namespace torch::executor; NSURL *model_url = [NBundle.mainBundle URLForResource:@"mv3_coreml_all" extension:@"pte"]; -Result loader = - util::FileDataLoader::from(model_url.path.UTF8String); - +Result loader = util::FileDataLoader::from(model_url.path.UTF8String); ``` 8. Use [Xcode](https://developer.apple.com/documentation/xcode/building-and-running-an-app#Build-run-and-debug-your-app) to deploy the application on the device. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 30904b29ddb..1a94577e90c 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -115,6 +115,10 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT +# Workaround for fbs files in exir/_serialize +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + mkdir build_x86_64 cd build_x86_64 cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} @@ -138,8 +142,8 @@ mkdir build_android cd build_android # build executorch & qnn_executorch_backend cmake .. 
\ - -DBUCK2=buck2 \ -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ @@ -220,6 +224,7 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build_android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ diff --git a/docs/source/build-run-vulkan.md b/docs/source/build-run-vulkan.md new file mode 100644 index 00000000000..736859b86f6 --- /dev/null +++ b/docs/source/build-run-vulkan.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/docs/android_demo.md diff --git a/docs/source/build-run-xtensa.md b/docs/source/build-run-xtensa.md index 296d9ac1193..17fd6049f90 100644 --- a/docs/source/build-run-xtensa.md +++ b/docs/source/build-run-xtensa.md @@ -64,17 +64,18 @@ Step 2. Make sure you have completed the ExecuTorch setup tutorials linked to at The working tree is: ``` -examples/xtensa/ +examples/cadence/ ├── aot ├── kernels ├── ops +├── tests ├── third-party └── utils ``` ***AoT (Ahead-of-Time) Components***: -The AoT folder contains all of the python scripts and functions needed to export the model to an ExecuTorch `.pte` file. In our case, [export_example.py](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/export_example.py) defines a model and some example inputs (set to a vector of ones), and runs it through the quantizer (from [quantizer.py](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/quantizer.py)). Then a few compiler passes, also defined in [quantizer.py](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/quantizer.py), will replace operators with custom ones that are supported and optimized on the chip. Any operator needed to compute things should be defined in [meta_registrations.py](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/meta_registrations.py) and have corresponding implemetations in the other folders. +The AoT folder contains all of the python scripts and functions needed to export the model to an ExecuTorch `.pte` file. In our case, [export_example.py](https://github.com/pytorch/executorch/blob/main/examples/cadence/aot/export_example.py) is an API that takes a model (nn.Module) and representative inputs and runs it through the quantizer (from [quantizer.py](https://github.com/pytorch/executorch/blob/main/examples/cadence/aot/quantizer.py)). Then a few compiler passes, also defined in [quantizer.py](https://github.com/pytorch/executorch/blob/main/examples/cadence/aot/quantizer.py), will replace operators with custom ones that are supported and optimized on the chip. Any operator needed to compute things should be defined in [meta_registrations.py](https://github.com/pytorch/executorch/blob/main/examples/cadence/aot/meta_registrations.py) and have corresponding implemetations in the other folders. ***Operators***: @@ -97,17 +98,31 @@ cd executorch python3 -m examples.portable.scripts.export --model_name="add" ``` -***Quantized Linear***: +***Quantized Operators***: -The second, more complex model is a quantized [linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) operation. 
The model is defined [here](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/export_example.py#L88). Linear is the backbone of most Automatic Speech Recognition (ASR) models. +The other, more complex model are custom operators, including: + - a quantized [linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) operation. The model is defined [here](https://github.com/pytorch/executorch/blob/main/examples/cadence/tests/quantized_linear_example.py#L28). Linear is the backbone of most Automatic Speech Recognition (ASR) models. + - a quantized [conv1d](https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html) operation. The model is defined [here](https://github.com/pytorch/executorch/blob/main/examples/cadence/tests/quantized_conv1d_example.py#L36). Convolutions are important in wake word and many denoising models. -The generated file is called `XtensaDemoModel.pte`. +In both cases the generated file is called `XtensaDemoModel.pte`. + +```bash +cd executorch +python3 -m examples.cadence.tests.quantized__example +``` + +***Small Model: RNNT predictor***: + +The torchaudio [RNNT-emformer](https://pytorch.org/audio/stable/tutorials/online_asr_tutorial.html) model is an Automatic Speech Recognition (ASR) model, comprised of three different submodels: an encoder, a predictor and a joiner. +The predictor is a sequence of basic ops (embedding, ReLU, linear, layer norm) and can be exported using: ```bash cd executorch -python3 -m examples.xtensa.aot.export_example +python3 -m examples.cadence.tests.rnnt_predictor_quantized_example ``` +The generated file is called `XtensaDemoModel.pte`. + ### Runtime **Building the DSP firmware image** @@ -116,7 +131,7 @@ In this step, you'll be building the DSP firmware image that consists of the sam ***Step 1***. Configure the environment variables needed to point to the Xtensa toolchain that you have installed in the previous step. The three environment variables that need to be set include: ```bash # Directory in which the Xtensa toolchain was installed -export XTENSA_TOOLCHAIN=/home/user_name/xtensa/XtDevTools/install/tools +export XTENSA_TOOLCHAIN=/home/user_name/cadence/XtDevTools/install/tools # The version of the toolchain that was installed. This is essentially the name of the directory # that is present in the XTENSA_TOOLCHAIN directory from above. export TOOLCHAIN_VER=RI-2021.8-linux @@ -135,31 +150,32 @@ In order to run the CMake build, you need the path to the following: cd executorch rm -rf cmake-out # prebuild and install executorch library -cmake -DBUCK2=buck2 \ - -DCMAKE_TOOLCHAIN_FILE=/examples/xtensa/xtensa.cmake \ +cmake -DCMAKE_TOOLCHAIN_FILE=/examples/cadence/cadence.cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ -DEXECUTORCH_BUILD_FLATC=OFF \ -DFLATC_EXECUTABLE="$(which flatc)" \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DPYTHON_EXECUTABLE=python3 \ -Bcmake-out . 
cmake --build cmake-out -j8 --target install --config Debug -# build xtensa runner +# build cadence runner cmake -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_TOOLCHAIN_FILE=/examples/xtensa/xtensa.cmake \ + -DCMAKE_TOOLCHAIN_FILE=/examples/cadence/cadence.cmake \ -DCMAKE_PREFIX_PATH=/cmake-out \ -DMODEL_PATH= \ -DNXP_SDK_ROOT_DIR= -DEXECUTORCH_BUILD_FLATC=0 \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DNN_LIB_BASE_DIR= \ - -Bcmake-out/examples/xtensa \ - examples/xtensa + -Bcmake-out/examples/cadence \ + examples/cadence -cmake --build cmake-out/examples/xtensa -j8 -t xtensa_executorch_example +cmake --build cmake-out/examples/cadence -j8 -t cadence_executorch_example ``` After having succesfully run the above step you should see two binary files in their CMake output directory. @@ -196,6 +212,6 @@ First 20 elements of output 0 In this tutorial, you have learned how to export a quantized operation, build the ExecuTorch runtime and run this model on the Xtensa HiFi4 DSP chip. -The model in this tutorial is a typical operation appearing in ASR models, and can be extended to a complete ASR model by creating the model in [export_example.py](https://github.com/pytorch/executorch/blob/main/examples/xtensa/aot/export_example.py) and adding the needed operators/kernels to [operators](https://github.com/pytorch/executorch/blob/main/examples/xtensa/ops) and [kernels](https://github.com/pytorch/executorch/blob/main/examples/xtensa/kernels). +The (quantized linear) model in this tutorial is a typical operation appearing in ASR models, and can be extended to a complete ASR model by creating the model as a new test and adding the needed operators/kernels to [operators](https://github.com/pytorch/executorch/blob/main/examples/cadence/ops) and [kernels](https://github.com/pytorch/executorch/blob/main/examples/cadence/kernels). Other models can be created following the same structure, always assuming that operators and kernels are available. diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 86c0c136300..1dad3b032fc 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -9,7 +9,7 @@ MemoryPlanning is the very last action taken before taking an `ExportedProgram` Concretely, there are three passes related to memory planning: * `SpecPropPass` computes a TensorSpec for each tensor in the graph (inputs, intermediates or outputs). The most important field of the tensor spec is a symbolic expression of the shapes of the tensor, where the initial set of symbols comes from the dimensions of input tensors, intermediate tensor shapes’ symbolic expression is propagated via tensor operations. The dimensions can be marked as either dynamic or static by users and when the dims are dynamic, users are required to annotate the dim with a ValueRange. -* `SymShapEvalPass` evaluates the symbolic expressions to concrete integers with their upper bounds. There are two ways to doing the upper bound specialization: +* `SymShapeEvalPass` evaluates the symbolic expressions to concrete integers with their upper bounds. There are two ways to doing the upper bound specialization: HintBasedSymShapeEval (to be deprecated) is the old way of evaluating the upper bound. It doesn’t look at the ValueRange of the symbols but uses the shapes of example inputs to replace all the symbols. We call it “hint based“ because the example inputs’ shapes are just hints of what the input shapes might be at run time and are used for tracing only. 
ValueRangeBasedSymShapeEval is the recommended way of doing UpperBoundMemory planning. It will actually look at the ValueRange of the symbols and do an inference over the ranges to get a real upper bound. * `MemoryPlanningPass` does the actual memory planning given all tensors get a TensorSpec with concrete integer shapes. @@ -18,9 +18,9 @@ HintBasedSymShapeEval (to be deprecated) is the old way of evaluating the upper ExecuTorch provides two options for memory planning algorithms out of the box, but users can define their own if the provided options are inappropriate or insufficient for their use case. -* The naive algorithm simply concatenates all the tensors together in a linear memory without considering any memory re-use. It serves as an upper bound for total memory consumption and serves as a baseline. +* The naive algorithm simply concatenates all the tensors together in a linear memory block without considering memory re-use. It serves as an upper bound for total memory consumption and serves as a baseline. -* The Greedy algorithm tries to re-use the already allocated memory and choose based on the best-fit criteria. Specifically: +* The Greedy algorithm tries to re-use the already allocated memory based on the best-fit criteria. Specifically: When there isn’t an allocated memory whose lifetime doesn’t overlap with the current tensor that we try to do memory planning for, we allocate a new memory buffer with the same size and lifetime as the current tensor. When there is one or more allocated memory buffer, whose lifetime overlaps with the current tensor, we pick the buffer that has the closest size with current tensor so as to reduce memory fragmentation. Finally, we allocate these memory buffers linearly in memory. @@ -48,7 +48,7 @@ Users can write custom memory plans to take advantage of multiple memory locatio ```python class CustomPoolMemoryPlanningPass(MemoryPlanningPass): - def call(self, graph_module: GraphModule) -> PassResult: + def run(self, graph_module: GraphModule, graph_signature: Optional[ExportGraphSignature]) -> PassResult: for subgm in graph_module.modules(): if not isinstance(subgm, GraphModule): continue @@ -68,7 +68,7 @@ class CustomPoolMemoryPlanningPass(MemoryPlanningPass): elif node.target == torch.ops.aten.mul.out: node.meta["spec"].mem_id = 1 - return super().call(graph_module) + return super().run(graph_module, graph_signature) ``` Then later when lowering to ExecuTorch you can use your custom plan in the following way: @@ -83,4 +83,4 @@ program = edge_program.to_executorch( ) ``` -Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12) +Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). diff --git a/docs/source/conf.py b/docs/source/conf.py index 239319f7c2f..423f0618f68 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,38 +73,23 @@ # Get ET_VERSION_DOCS during the build. 
et_version_docs = os.environ.get("ET_VERSION_DOCS", None) - +print(f"et_version_docs: {et_version_docs}") # The code below will cut version displayed in the dropdown like this: -# tags like v0.1.0 = > 0.1 -# branch like release/0.1 => 0.1 -# main will remain main -# if not set will fail back to main +# By default, set to "main". +# If it's a tag like refs/tags/v1.2.3-rc4 or refs/tags/v1.2.3, then +# cut to 1.2 # the version varible is used in layout.html: https://github.com/pytorch/executorch/blob/main/docs/source/_templates/layout.html#L29 +version = release = "main" if et_version_docs: - # Check if starts with release/ and set the version to the number after slash - if et_version_docs.startswith("release/"): + if et_version_docs.startswith("refs/tags/v"): + version = ".".join( + et_version_docs.split("/")[-1].split("-")[0].lstrip("v").split(".")[:2] + ) + elif et_version_docs.startswith("refs/heads/release/"): version = et_version_docs.split("/")[-1] - else: - # Remove "v" prefix if present - if et_version_docs.startswith("v"): - et_version_docs = et_version_docs[1:] - # Split to major, minor, and patch - version_components = et_version_docs.split(".") - - # Combine the major and minor version components: - if len(version_components) >= 2: - version = release = ".".join(version_components[:2]) - else: - # If there are not enough components, use the full version - version = release = et_version_docs - - html_title = " ".join((project, version, "documentation")) -# IF ET_VERSION_DOCS not set, set version to main. -# This can be updated to nightly and so on. -else: - version = "main" - release = "main" +print(f"Version: {version}") +html_title = " ".join((project, version, "documentation")) breathe_projects = {"ExecuTorch": "../build/xml/"} breathe_default_project = "ExecuTorch" diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md new file mode 100644 index 00000000000..ebcf94136c7 --- /dev/null +++ b/docs/source/debug-backend-delegate.md @@ -0,0 +1,65 @@ +# Debug Backend Delegate + +We provide a list of util functions to give users insights on what happened to the graph modules during the `to_backend()` stage. + +## Get delegation summary +The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: + +```python +from executorch.exir.backend.utils import get_delegation_info +from tabulate import tabulate + +# ... After call to to_backend(), but before to_executorch() +graph_module = edge_manager.exported_program().graph_module +delegation_info = get_delegation_info(graph_module) +print(delegation_info.get_summary()) +df = delegation_info.get_operator_delegation_dataframe() +print(tabulate(df, headers="keys", tablefmt="fancy_grid")) +``` + +Example printout: +``` +Total delegated subgraphs: 86 +Number of delegated nodes: 473 +Number of non-delegated nodes: 430 +``` + + +| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +|----|---------------------------------|------- |-----| +| 0 | aten__softmax_default | 12 | 0 | +| 1 | aten_add_tensor | 37 | 0 | +| 2 | aten_addmm_default | 48 | 0 | +| 3 | aten_arange_start_step | 0 | 25 | +| | ... | | | +| 23 | aten_view_copy_default | 170 | 48 | +| | ... | | | +| 26 | Total | 473 | 430 | + +From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug. 
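As a quick way to act on that table, the same dataframe can be filtered down to the operators that were not delegated anywhere. The snippet below is a minimal sketch, not part of the patch above: it reuses `df` and `tabulate` from the earlier example and assumes the column names shown in the example table (`op_type`, `occurrences_in_non_delegated_graphs`), which may differ in other versions.

```python
# Minimal sketch: rank the operators that still run outside the delegate,
# so the largest offenders can be inspected first.
# Assumes `df` and `tabulate` from the example above, and the column names
# shown in the example table.
non_delegated = df[
    (df["occurrences_in_non_delegated_graphs"] > 0) & (df["op_type"] != "Total")
].sort_values("occurrences_in_non_delegated_graphs", ascending=False)

print(tabulate(non_delegated, headers="keys", tablefmt="fancy_grid"))
```

Ops that appear high in this ranking are natural candidates for adding partitioner support or for checking why they were rejected by the backend.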
+ +## Visualize delegated graph +To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph: + +```python +from executorch.exir.backend.utils import print_delegated_graph +graph_module = edge_manager.exported_program().graph_module +print(print_delegated_graph(graph_module)) +``` +It will print the whole model as well as the subgraph consumed by the backend. The generic debug function provided by fx like `print_tabular()` or `print_readable()` will only show `call_delegate` but hide the the subgraph consumes by the backend, while this function exposes the contents inside the subgraph. + +In the example printout below, observe that `embedding` and `add` operators are delegated to `XNNPACK` while the `sub` operator is not. + +``` +%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) + %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) + %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] + backend_id: XnnpackBackend + lowered graph(): + %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] + %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + return (aten_add_tensor,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) +``` diff --git a/docs/source/demo-apps-ios.md b/docs/source/demo-apps-ios.md index e04b6cae681..d68b1309e2b 100644 --- a/docs/source/demo-apps-ios.md +++ b/docs/source/demo-apps-ios.md @@ -1 +1 @@ -```{include} ../../examples/demo-apps/apple_ios/README.md +```{include} ../../examples/demo-apps/apple_ios/ExecuTorchDemo/README.md diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index ffc33498483..833b5ee0ca9 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -184,6 +184,39 @@ Output 0: tensor(sizes=[1], [2.]) To learn how to build a similar program, visit the [ExecuTorch in C++ Tutorial](running-a-model-cpp-tutorial.md). +### [Optional] Setting Up Buck2 +**Buck2** is an open-source build system that some of our examples currently utilize for building and running. + +However, please note that the installation of `Buck2` is optional for using ExecuTorch and we are in the process of transitioning away from `Buck2` and migrating all relevant sections to `cmake`. This section will be removed once we finish the migration. 
+ +To set up `Buck2`, You will need the following prerequisits for this section: +* The `zstd` command line tool — install by running + ```bash + pip3 install zstd + ``` +* Version `${executorch_version:buck2}` of the `buck2` commandline tool — you can download a + prebuilt archive for your system from [the Buck2 + repo](https://github.com/facebook/buck2/releases/tag/2024-02-15). Note that + the version is important, and newer or older versions may not work with the + version of the buck2 prelude used by the ExecuTorch repo. + +Configure Buck2 by decompressing with the following command (filename depends + on your system): + + ```bash + # For example, buck2-x86_64-unknown-linux-musl.zst or buck2-aarch64-apple-darwin.zst + zstd -cdq buck2-DOWNLOADED_FILENAME.zst > /tmp/buck2 && chmod +x /tmp/buck2 + ``` + +You may want to copy the `buck2` binary into your `$PATH` so you can run it + as `buck2`. + +After the installation, you can run the `add.pte` program by following `buck2` command: + +```bash +/tmp/buck2 run //examples/portable/executor_runner:executor_runner -- --model_path add.pte +``` + ## Next Steps Congratulations! You have successfully exported, built, and run your first diff --git a/docs/source/index.rst b/docs/source/index.rst index adbda475aa2..47feda0f783 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,12 +4,16 @@ Welcome to the ExecuTorch Documentation ======================================= .. important:: - This is a preview version of ExecuTorch and should be used for testing - and evaluation purposes only. It is not recommended for use in production - settings. We welcome any feedback, suggestions, and bug reports from the - community to help us improve the technology. Please use the `PyTorch - Forums `__ for discussion and - feedback about ExecuTorch using the **ExecuTorch** category, and our `GitHub + This is an alpha release; the ExecuTorch APIs and the ``.pte`` binary format + may change in incompatible ways before stabilizing in a future beta release. + When deploying models, we currently recommend using a version of the runtime + built from the same git revision that was used to generate the ``.pte`` file. + Once the format has stabilized, this will no longer be necessary. + + We welcome any feedback, suggestions, and bug reports from the community + to help us improve the technology. Please use the `PyTorch Forums + `__ for discussion and feedback + about ExecuTorch using the **ExecuTorch** category, and our `GitHub repository `__ for bug reporting. @@ -100,6 +104,7 @@ Topics in this section will help you get started with ExecuTorch. demo-apps-android examples-end-to-end-to-lower-model-to-delegate tutorial-xnnpack-delegate-lowering + build-run-vulkan .. Alphabetical by backend name. Be sure to keep the same order in the customcarditem entries below. @@ -183,8 +188,10 @@ Topics in this section will help you get started with ExecuTorch. :hidden: native-delegates-executorch-xnnpack-delegate + native-delegates-executorch-vulkan-delegate backend-delegates-integration backend-delegates-dependencies + debug-backend-delegate .. toctree:: :glob: @@ -262,6 +269,13 @@ ExecuTorch tutorials. :link: tutorial-xnnpack-delegate-lowering.html :tags: Export,Backend,Delegation,Quantization,XNNPACK +.. 
customcarditem:: + :header: Building and Running ExecuTorch with Vulkan Backend + :card_description: A tutorial that walks you through the process of building ExecuTorch with Vulkan Backend + :image: _static/img/generic-pytorch-logo.png + :link: build-run-vulkan.html + :tags: Export,Backend,Delegation,Vulkan + .. Alphabetical by backend name. Be sure to keep the same order in the Tutorials toctree entry above. diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 4b0794ea1c5..4d391b1a944 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -86,10 +86,88 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` +### Custom Ops C++ API + +For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: +1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros. +2. Using `functions.yaml` and codegen'd C++ libraries. + +The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. + +The first option is particularly suitable for fast prototyping but can also be used in production. + +Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. + +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into PyTorch & ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) 
out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. + +Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. +op = torch.ops.myop.custom_linear.default +``` + ### Custom Ops Yaml Entry -For custom ops (the ones that are not part of the out variants of core ATen opset) we need to specify the operator schema as well as a `kernel` section. So instead of `op` we use `func` with the operator schema. As an example, here’s a yaml entry for a custom op: +As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. + +First we need to specify the operator schema as well as a `kernel` section. So instead of `op` we use `func` with the operator schema. As an example, here’s a yaml entry for a custom op: ```yaml - func: allclose.out(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False, bool dummy_param=False, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -159,6 +237,30 @@ target_link_libraries(executorch_binary generated_lib) ``` +We also provide the ability to merge two yaml files, given a precedence. `merge_yaml(FUNCTIONS_YAML functions_yaml FALLBACK_YAML fallback_yaml OUTPUT_DIR out_dir)` merges functions_yaml and fallback_yaml into a single yaml, if there's duplicate entries in functions_yaml and fallback_yaml, this macro will always take the one in functions_yaml. + +Example: + +```yaml +# functions.yaml +- op: add.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_add_out +``` + +And out fallback: + +```yaml +# fallback.yaml +- op: add.out + kernels: + - arg_meta: null + kernel_name: torch::executor::add_out +``` + +The merged yaml will have the entry in functions.yaml. + #### Buck2 `executorch_generated_lib` is the macro that takes the yaml files and depends on the selective build macro `et_operator_library`. For an example: diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 7d79e51df7e..1c70cc7e7fd 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -101,9 +101,9 @@ functions_yaml_target = "//executorch/kernels/portable:functions.yaml", deps = [ "//executorch/examples/portable/custom_ops:custom_ops_1", # kernel library "//executorch/examples/portable/custom_ops:custom_ops_2", # kernel library - "//executorch/kernels/portable:operators", # kernel library + "//executorch/kernels/portable:operators", # kernel library ":select_ops_from_yaml", - ":select_ops_in_list", + ":select_ops_in_list", ], ) ``` diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index eff8fd52ffb..ae743e8e6d0 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -1,5 +1,18 @@ # Getting Started with LLMs via ExecuTorch +Welcome to LLM Manual! 
This manual is designed to provide a practical example to leverage +ExecuTorch in onboarding your own Large Language Models (LLMs). Our primary goal is to offer + a clear and concise guideline on how to integrate our system with your own LLMs. + +Please note that this project is intended as a demonstration and not as a fully functional +example with optimal performance. As such, certain components such as the sampler, tokenizer, +and others are provided in their bare minimum versions solely for demonstration purposes. +Consequently, the results produced by the model may vary and might not always be optimal. + +We encourage users to use this project as a starting point and adapt it to their specific needs, +which includes creating your own versions of the tokenizer, sampler, acceleration backends, and +other components. We hope this project serves as a useful guide in your journey with LLMs and ExecuTorch. + ### Table Of Contents @@ -14,208 +27,490 @@ ## Prerequisites -Let’s start by getting an ExecuTorch environment: +To follow this guide, you'll need to clone the ExecuTorch repository and install dependencies. +ExecuTorch recommends Python 3.10 and the use of Conda to manage your environment. Conda is not +required, though be aware that you may need to replace the use of python/pip with python3/pip3 +depending on your environment. + +::::{tab-set} +:::{tab-item} conda +Instructions on installing miniconda can be [found here](https://docs.anaconda.com/free/miniconda). -1. Create a third-party folder (Keeps the file paths organized) -``` -mkdir third-party -cd third-party ``` -2. If you’re new to ExecuTorch follow [these steps](https://pytorch.org/executorch/main/getting-started-setup.html#set-up-your-environment) to set up your environment. +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt -## Instantiating and Executing an LLM +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init -We will use Karpathy’s [NanoGPT](https://github.com/karpathy/nanoGPT) but you can use another model if you prefer. +# Create a conda environment and install requirements. +conda create -yn executorch python=3.10.0 +conda activate executorch +pip install cmake zstd +./install_requirements.sh +cd ../.. +``` +::: +:::{tab-item} pyenv-virtualenv +Instructions on installing pyenv-virtualenv can be [found here](https://github.com/pyenv/pyenv-virtualenv?tab=readme-ov-file#installing-with-homebrew-for-macos-users). +Importantly, if installing pyenv through brew, it does not automatically enable pyenv in the terminal, leading to errors. Run the following commands to enable. +See the pyenv-virtualenv installation guide above on how to add this to your .bashrc or .zshrc to avoid needing to run these commands manually. +``` +eval "$(pyenv init -)" +eval "$(pyenv virtualenv-init -)" +``` -There are just 2 steps to this: +``` +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt -1. Export the LLM Model -2. Create a runtime to execute the model +pyenv install -s 3.10 +pyenv virtualenv 3.10 executorch +pyenv activate executorch +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init +# Install requirements. 
+pip install cmake zstd +PYTHON_EXECUTABLE=python ./install_requirements.sh +cd ../.. +``` +::: +:::: -Note: Reminder to exit out of the “third-party” directory, before proceeding. +For more information, see [Setting Up ExecuTorch](../getting-started-setup.md). -### Step 1. Export -[Exporting to ExecuTorch](https://pytorch.org/executorch/main/export-overview.html) simply describes taking an existing model and converting it to the ExecuTorch format. +## Running a Large Language Model Locally +This example uses Karpathy’s [nanoGPT](https://github.com/karpathy/nanoGPT), which is a minimal implementation of +GPT-2 124M. This guide is applicable to other language models, as ExecuTorch is model-invariant. +There are two steps to running a model with ExecuTorch: -To start, let’s retrieve our model: +1. Export the model. This step preprocesses it into a format suitable for runtime execution. +2. At runtime, load the model file and run with the ExecuTorch runtime. -`wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py` +
-Next, we’ll create a script (call it export.py) to generate the ExecuTorch Program (which gets dumped into an ExecuTorch Binary): +The export step happens ahead of time, typically as part of the application build or when the model changes. The resultant +.pte file is distributed with the application. At runtime, the application loads the .pte file and passes it to the +ExecuTorch runtime. +### Step 1. Exporting to ExecuTorch +Exporting takes a PyTorch model and converts it into a format that can run efficiently on consumer devices. -1. Create the model and example inputs -``` -import torch -from model import GPT +For this example, you will need the nanoGPT model and the corresponding tokenizer vocabulary. -model = GPT.from_pretrained('gpt2') -example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), ) +::::{tab-set} +:::{tab-item} curl +``` +curl https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py -O +curl https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -O +``` +::: +:::{tab-item} wget ``` +wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py +wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json +``` +::: +:::: +To convert the model into a format optimized for standalone execution, there are two steps. First, use the PyTorch +`export` function to convert the PyTorch model into an intermediate, platform-independent intermediate representation. Then +use the ExecuTorch `to_edge` and `to_executorch` methods to prepare the model for on-device execution. This creates a .pte +file which can be loaded by a desktop or mobile application at runtime. +Create a file called export_nanogpt.py with the following contents: -2. Trace the model -Tracing extracts a cleaner representation of our model for conversion to ExecuTorch. -You can read more about tracing in [torch.export — PyTorch 2.2 documentation](https://pytorch.org/docs/stable/export.html). +```python +# export_nanogpt.py -``` -from torch.nn.attention import sdpa_kernel, SDPBackend +import torch + +from executorch.exir import EdgeCompileConfig, to_edge +from torch.nn.attention import sdpa_kernel, SDPBackend from torch._export import capture_pre_autograd_graph from torch.export import export -# Using a custom SDPA kernel for LLMs -with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): +from model import GPT -m = capture_pre_autograd_graph(model, example_inputs) +# Load the model. +model = GPT.from_pretrained('gpt2') + +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = (torch.randint(0, 100, (1, model.config.block_size), dtype=torch.long), ) + +# Set up dynamic shape configuration. This allows the sizes of the input tensors +# to differ from the sizes of the tensors in `example_inputs` during runtime, as +# long as they adhere to the rules specified in the dynamic shape configuration. +# Here we set the range of 0th model input's 1st dimension as +# [0, model.config.block_size]. +# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# for details about creating dynamic shapes. +dynamic_shape = ( + {1: torch.export.Dim("token_dim", max=model.config.block_size)}, +) -traced_model = export(m, example_inputs) -``` +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. 
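+# capture_pre_autograd_graph returns an ATen-level graph module, and export()
+# then produces the ExportedProgram that to_edge() consumes below. Both calls
+# receive the same dynamic_shapes spec so the token dimension stays dynamic.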
+with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) -3. Export the model to ExecuTorch -Exporting (or lowering) takes the model and creates a runnable ExecuTorch program, without delegate to any specific bakends for further acceleration. -``` -from executorch.exir import EdgeCompileConfig, to_edge +# Convert the model into a runnable ExecuTorch program. +edge_config = EdgeCompileConfig(_check_ir_validity=False) +edge_manager = to_edge(traced_model, compile_config=edge_config) +et_program = edge_manager.to_executorch() -edge_config = EdgeCompileConfig(_check_ir_validity=False) -edge_manager = to_edge(traced_model, compile_config=edge_config) -et_program = edge_manager.to_executorch() +# Save the ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) ``` -Also ExecuTorch provides different backend support for mobile acceleration. Simply call `to_backend()` with the specific backend partitioner on edge_manager during exportation. Take Xnnpack delegation as an example: +To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory. +For more information, see [Exporting to ExecuTorch](../tutorials/export-to-executorch-tutorial) and +[torch.export](https://pytorch.org/docs/stable/export.html). -``` -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config -from executorch.exir import EdgeCompileConfig, to_edge +### Step 2. Invoking the Runtime -edge_config = edge_config = get_xnnpack_edge_compile_config() -edge_manager = to_edge(traced_model, compile_config=edge_config) -edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +ExecuTorch provides a set of runtime APIs and types to load and run models. -et_program = edge_manager.to_executorch() -``` +Create a file called main.cpp with the following contents: -After that, we’re ready to run our model. Remember to save you model before proceeding: +```cpp +// main.cpp -``` -#Write the serialized ExecuTorch program to a file. -with open("nanogpt.pte", "wb") as file: -file.write(et_program.buffer) +#include +#include +#include +#include + +#include "basic_tokenizer.h" +#include "basic_sampler.h" +#include "managed_tensor.h" + +#include +#include +#include +#include +#include + +using namespace torch::executor; + +using SizesType = exec_aten::SizesType; +using DimOrderType = exec_aten::DimOrderType; +using StridesType = exec_aten::StridesType; ``` +The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. +The ExecuTorch `EValue` class provides a wrapper around tensors and other ExecuTorch data types. -Then run the script. -`python export.py` +Since the LLM generates one token at a time, the driver code needs to repeatedly invoke the model, building the +output token by token. Each generated token is passed as input for the next run. -### Step 2. Running the model -Running model stands for executing the exported model on ExecuTorch runtime platform. +```cpp +// main.cpp -Before running, we need to retrieve vocabulary file GPT2 used for tokenization: +// The value of the gpt2 `<|endoftext|>` token. 
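+// 50256 is the ID that GPT-2's BPE vocabulary assigns to `<|endoftext|>`;
+// generate() below stops emitting output once the model produces this token.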
+#define ENDOFTEXT_TOKEN 50256 -``` -wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -``` -1. Create the prompt: -Prompt here means the initial cue given to the model, which it uses as a starting point to generate following sentences. Here we use “Hello world!” as example: +std::string generate( + Module& llm_model, + std::string& prompt, + BasicTokenizer& tokenizer, + BasicSampler& sampler, + size_t max_input_length, + size_t max_output_length) { + // Convert the input text into a list of integers (tokens) that represents + // it, using the string-to-token mapping that the model was trained on. + // Each token is an integer that represents a word or part of a word. + std::vector input_tokens = tokenizer.encode(prompt); + std::vector output_tokens; -``` -string prompt = "Hello world!"; + for (auto i = 0u; i < max_output_length; i++) { + // Convert the input_tokens from a vector of int64_t to EValue. + // EValue is a unified data type in the ExecuTorch runtime. + ManagedTensor tensor_tokens( + input_tokens.data(), + {1, static_cast(input_tokens.size())}, + ScalarType::Long); + std::vector inputs = {tensor_tokens.get_tensor()}; + + // Run the model. It will return a tensor of logits (log-probabilities). + Result> logits_evalue = llm_model.forward(inputs); + + // Convert the output logits from EValue to std::vector, which is what + // the sampler expects. + Tensor logits_tensor = logits_evalue.get()[0].toTensor(); + std::vector logits(logits_tensor.data_ptr(), + logits_tensor.data_ptr() + logits_tensor.numel()); + + // Sample the next token from the logits. + int64_t next_token = sampler.sample(logits); + + // Break if we reached the end of the text. + if (next_token == ENDOFTEXT_TOKEN) { + break; + } + + // Add the next token to the output. + output_tokens.push_back(next_token); + + std::cout << tokenizer.decode({ next_token }); + std::cout.flush(); + + // Update next input. + input_tokens.push_back(next_token); + if (input_tokens.size() > max_input_length) { + input_tokens.erase(input_tokens.begin()); + } + } + + std::cout << std::endl; + + // Convert the output tokens into a human-readable string. + std::string output_string = tokenizer.decode(output_tokens); + return output_string; +} ``` +The `Module` class handles loading the .pte file and preparing for execution. -2. Load tokenizer and model -A Tokenizer is a crucial component among different Natural Language Processing (NLP) tasks. The primary functionalities are: +The tokenizer is responsible for converting from a human-readable string representation of the prompt to the +numerical form expected by the model. To do this, the tokenzier associates short substrings with a given token ID. +The tokens can be thought of as representing words or parts of words, though, in-practice, they may be arbitrary +sequences of characters. -- Encode: Convert text into structural and numerical representations by parsing text into smaller units.Each unit is replaced by a specific number for the NLP model to consume +The tokenizer loads the vocabulary from a file, which contains the mapping between each token ID and the text it +represents. Call `tokenizer.encode()` and `tokenizer.decode()` to convert between string and token representations. -- Decode: Convert the numerical representations back for human interpretation. +The sampler is responsible for selecting the next token, based on the logits, or log-probabilties, output by the +model. The LLM returns a logit value for each possible next token. 
The sampler chooses which token to use based +on some strategy. The simplest approach, used here, is to take the token with the highest logit value. +Samplers may provide configurable options, such as configurable amount of randomness to the outputs selection, +penalties for repeated tokens, and biases to prioritize or de-prioritize specific tokens. -In our NanoGPT example, we create a simple tokenizer called BasicTokenizer to demonstrate the function. You can use other implementations like [tiktoken](https://github.com/openai/tiktoken) or your own implementation to do that. +```cpp +// main.cpp -``` -#include "basic_tokenizer.h" -BasicTokenizer tokenizer("vocab.json"); -``` +int main() { + // Set up the prompt. This provides the seed text for the model to elaborate. + std::cout << "Enter model prompt: "; + std::string prompt; + std::getline(std::cin, prompt); + // The tokenizer is used to convert between tokens (used by the model) and + // human-readable strings. + BasicTokenizer tokenizer("vocab.json"); -To load the exported ExecuTorch model into runtime environment, we can use **Module** class: + // The sampler is used to sample the next token from the logits. + BasicSampler sampler = BasicSampler(); + // Load the exported nanoGPT program, which was generated via the previous steps. + Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors); -``` -#include -Module llm_model("nanogpt.pte"); + const auto max_input_tokens = 1024; + const auto max_output_tokens = 30; + std::cout << prompt; + generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens); +} ``` +Finally, download the following files into the same directory as main.h: -3. Tokenize the prompt ``` -vector tokens = tokenizer.encode(prompt); +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h +curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` -4. Generate outputs -We use the loaded model to generate text based on tokenized prompt. Here we create a helper function to illustrate the pipeline: +To learn more, see [Running an ExecuTorch Model in C++](../running-a-model-cpp-tutorial.md) +and the [ExecuTorch Runtime API Reference](../executorch-runtime-api-reference.md). + +### Building and Running + +ExecuTorch uses the CMake build system. To compile and link against the ExecuTorch runtime, +include the ExecuTorch project via `add_directory` and link against `executorch` and additional +dependencies. + +Create a file named CMakeLists.txt with the following content: ``` -vector generate(Module& llm_model, vector& input_tokens, BasicSampler& sampler, size_t target_output_length) { - vector output_tokens; - for (int i = 0; i < target_output_length; i++) { - // Convert the input_tokens from a vector of int64_t to EValue. - // Evalue is a unified data type in the executorch runtime. - ManagedTensor tensor_tokens(input_tokens.data(), {1, 8}, ScalarType::Long); - vector inputs = {tensor_tokens.get_tensor()}; - // Run the model given the Evalue inputs. The model will also return a sequence of EValues as output. - Result> logits_evalue = llm_model.forward(inputs); - // Convert the output from EValue to a logits in float. 
- Tensor logits_tensor = logits_evalue.get()[0].toTensor(); - vector logits(logits_tensor.data_ptr(), logits_tensor.data_ptr() + logits_tensor.numel()); - // Sample the next token from the logits. - int64_t next_token = sampler.sample(logits); - // Record the next token - output_tokens.push_back(next_token); - // Update next input. - input_tokens.erase(input_tokens.begin()); - input_tokens.push_back(next_token); - } - return output_tokens; -} +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.19) +project(nanogpt_runner) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Set options for executorch build. +option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) +option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) + +# Include the executorch subdirectory. +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch + ${CMAKE_BINARY_DIR}/third-party/executorch) + +add_executable(nanogpt_runner main.cpp) +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels +``` + +At this point, the working directory should contain the following files: + +- CMakeLists.txt +- main.cpp +- basic_tokenizer.h +- basic_sampler.h +- managed_tensor.h +- export_nanogpt.py +- model.py +- vocab.json +- nanogpt.pte + +If all of these are present, you can now build and run: +```bash +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) +cmake --build cmake-out -j10 +./cmake-out/nanogpt_runner ``` +You should see the message: -And in the main function, we leverage the function to generate the outputs. ``` -vector outputs = generate(llm_model, tokens, sampler, /*target_output_length*/20); +Enter model prompt: ``` -Notice that here outputs are tokens, rather than actual natural language. -5. Decode the output. -We convert the generated output tokens back to natural language for better understanding: +Type some seed text for the model and press enter. Here we use "Hello world!" as +an example prompt: ``` -string out_str = tokenizer.decode(outputs); -``` +Enter model prompt: Hello world! +Hello world! -6. Print the generated text +I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in ``` -cout << "output: " << out_str << endl; + +At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for +specific hardware (delegation), and because it is doing all of the calculations in 32-bit floating point (no quantization). + +## Delegation + +While ExecuTorch provides a portable, cross-platform implementation for all +operators, it also provides specialized backends for a number of different +targets. These include, but are not limited to, x86 and ARM CPU acceleration via +the XNNPACK backend, Apple acceleration via the Core ML backend and Metal +Performance Shader (MPS) backend, and GPU acceleration via the Vulkan backend. + +Because optimizations are specific to a given backend, each pte file is specific +to the backend(s) targeted at export. To support multiple devices, such as +XNNPACK acceleration for Android and Core ML for iOS, export a separate PTE file +for each backend. + +To delegate to a backend at export time, ExecuTorch provides the `to_backend()` +function in the `EdgeProgramManager` object, which takes a backend-specific +partitioner object. 
The partitioner is responsible for finding parts of the +computation graph that can be accelerated by the target backend,and +`to_backend()` function will delegate matched part to given backend for +acceleration and optimization. Any portions of the computation graph not +delegated will be executed by the ExecuTorch operator implementations. + +To delegate the exported model to a specific backend, we need to import its +partitioner as well as edge compile config from ExecuTorch codebase first, then +call `to_backend` with an instance of partitioner on the `EdgeProgramManager` +object `to_edge` function created. + +Here's an example of how to delegate nanoGPT to XNNPACK (if you're deploying to an Android phone for instance): + +```python +# export_nanogpt.py + +# Load partitioner for Xnnpack backend +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# Model to be delegated to specific backend should use specific edge compile config +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config +from executorch.exir import EdgeCompileConfig, to_edge + +import torch +from torch.export import export +from torch.nn.attention import sdpa_kernel, SDPBackend +from torch._export import capture_pre_autograd_graph + +from model import GPT + +# Load the nanoGPT model. +model = GPT.from_pretrained('gpt2') + +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = ( + torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long), + ) + +# Set up dynamic shape configuration. This allows the sizes of the input tensors +# to differ from the sizes of the tensors in `example_inputs` during runtime, as +# long as they adhere to the rules specified in the dynamic shape configuration. +# Here we set the range of 0th model input's 1st dimension as +# [0, model.config.block_size]. +# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# for details about creating dynamic shapes. +dynamic_shape = ( + {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)}, +) + +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) + +# Convert the model into a runnable ExecuTorch program. +# To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config +edge_config = get_xnnpack_edge_compile_config() +edge_manager = to_edge(traced_model, compile_config=edge_config) + +# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner. +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +et_program = edge_manager.to_executorch() + +# Save the Xnnpack-delegated ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) + + ``` -### Build and Run -1. Create the Cmake file for build +Additionally, update CMakeLists.txt to build and link the XNNPACK backend to +ExecuTorch runner. + ``` cmake_minimum_required(VERSION 3.19) project(nanogpt_runner) @@ -223,71 +518,103 @@ project(nanogpt_runner) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) - # Set options for executorch build. 
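+# These options enable the extensions and backends that the runner links
+# against below (extension_module_static, optimized_native_cpu_ops_lib,
+# xnnpack_backend).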
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) -option(EXECUTORCH_BUILD_XNNPACK "" ON) -option(EXECUTORCH_BUILD_SDK "" ON) # Needed for etdump +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) +option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend # Include the executorch subdirectory. add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../executorch + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch ${CMAKE_BINARY_DIR}/executorch) # include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) -add_executable(nanogpt_runner nanogpt_runner.cpp) +add_executable(nanogpt_runner main.cpp) target_link_libraries( nanogpt_runner PRIVATE - etdump - extension_module - portable_ops_lib) - + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` -This CMake file links the ExecuTorch codebase, along with the necessary extensions and XNNPACK modules, to the nanogpt runner. +Keep the rest of the code the same. For more details, refer to [Exporting +to ExecuTorch](#step-1-exporting-to-executorch) and [Invoking the +Runtime](#step-2-invoking-the-runtime). -2. Build the c++ environment for nanorunner -``` -(rm -rf cmake-out \ - && mkdir cmake-out \ - && cd cmake-out \ - && cmake ..) -``` +At this point, the working directory should contain the following files: -3. With this CMake file as well as built environment iin place, you can build the nanogpt runner binary by executing the following command: +- CMakeLists.txt +- main.cpp +- basic_tokenizer.h +- basic_sampler.h +- managed_tensor.h +- export_nanogpt.py +- model.py +- vocab.json +If all of these are present, you can now export the XNNPACK-delegated .pte model: +```bash +python export_nanogpt.py ``` -cmake --build cmake-out --target nanogpt_runner -j9 + +It will generate `nanogpt.pte` in the same working directory. + +Then we can build and run the model: +```bash +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) +cmake --build cmake-out -j10 +./cmake-out/nanogpt_runner ``` -4. After the build is complete, you can run the binary with this command: + +You should see the message: + ``` -./cmake-out/nanogpt_runner +Enter model prompt: ``` -If everything worked it should see something like this: + +Type some seed text for the model and press enter. Here we use "Hello world!" as +an example prompt: + ``` -prompt: Hello world! -output: Hello world! +Enter model prompt: Hello world! +Hello world! -I'm not sure if you've heard of the "Curse of the Dragon" or +I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in ``` -## Quantization (Optional) +The delegated model should be noticeably faster compared to the non-delegated model. -Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data types used, and impact on model accuracy and performance. +For more information regarding backend delegation, see the ExecuTorch guides +for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML +Backend](../build-run-coreml.md). 
-Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship large models on consumer electronics. In particular, large language models, such as Llama2, may require quantizing model weights to 4 bits or less. +## Quantization -Leveraging quantization requires transforming the model before export. PyTorch provides multiple quantization flows. Because we are quantizing a model for export, we need to use the PyTorch 2.0 export (pt2e) quantization API. +Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. +Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in +memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data +types used, and impact on model accuracy and performance. -This example targets CPU acceleration using the XNNPACK delegate. As such, we need to use the XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. +Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship +large models on consumer electronics. In particular, large language models, such as Llama2, may require quantizing +model weights to 4 bits or less. -To use 8-bit integer dynamic quantization with the XNNPACK delegate, perform the following calls prior to calling export. This will update and annotate the computational graph to use quantized operators, where available. +Leveraging quantization requires transforming the model before export. PyTorch provides the pt2e (PyTorch 2 Export) +API for this purpose. This example targets CPU acceleration using the XNNPACK delegate. As such, it needs to use the + XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. + +To use 8-bit integer dynamic quantization with the XNNPACK delegate, call `prepare_pt2e`, calibrate the model by +running with a representative input, and then call `convert_pt2e`. This updates the computational graph to use +quantized operators where available. + +```python +# export_nanogpt.py -``` from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, ) @@ -296,7 +623,9 @@ from torch.ao.quantization.quantizer.xnnpack_quantizer import ( XNNPACKQuantizer, ) from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +``` +```python # Use dynamic, per-channel quantization. xnnpack_quant_config = get_symmetric_quantization_config( is_per_channel=True, is_dynamic=True @@ -318,48 +647,53 @@ m = convert_pt2e(m, fold_quantize=False) DuplicateDynamicQuantChainPass()(m) traced_model = export(m, example_inputs) - ``` -Additionally, add or update the to_backend() call to use XnnpackDynamicallyQuantizedPartitioner. This will instruct the lowering logic to emit the correct quantized operators. +Additionally, add or update the `to_backend()` call to use `XnnpackPartitioner`. This instructs ExecuTorch to +optimize the model for CPU execution via the XNNPACK backend. -``` +```python from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackDynamicallyQuantizedPartitioner, + XnnpackPartitioner, ) +``` +```python edge_manager = to_edge(traced_model, compile_config=edge_config) - -# Lower to XNNPACK using the appropriate quantized partitioner. 
-edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner()) - +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) # Lower to XNNPACK. et_program = edge_manager.to_executorch() ``` -Finally, update the CMakeLists.txt to link the XNNPACK backend with the runner. + +Finally, ensure that the runner links against the `xnnpack_backend` target in CMakeLists.txt. ``` -add_executable(nanogpt_runner nanogpt_runner.cpp) +add_executable(nanogpt_runner main.cpp) target_link_libraries( nanogpt_runner PRIVATE - etdump - extension_module - portable_ops_lib - xnnpack_backend) # Link the XNNPACK backend + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` -## Debugging and Profiling -After lowering a model by calling to_backend(), you might want to see what got delegated and what didn’t. We provide util functions to help you get insight on the delegation, and with such information, you can debug and maybe improve the delegation. +For more information, see [Quantization in ExecuTorch](../quantization-overview.md). -### Debug the Delegation +## Profiling and Debugging +After lowering a model by calling `to_backend()`, you may want to see what got delegated and what didn’t. ExecuTorch +provides utility methods to give insight on the delegation. You can use this information to gain visibility into +the underlying computation and diagnose potential performance issues. Model authors can use this information to +structure the model in a way that is compatible with the target backend. -1. Get high level information -get_delegation_info gives you a summary of what happened to the model after the to_backend() call: +### Visualizing the Delegation -``` +The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: + +```python from executorch.exir.backend.utils import get_delegation_info from tabulate import tabulate +# ... 
After call to to_backend(), but before to_executorch() graph_module = edge_manager.exported_program().graph_module delegation_info = get_delegation_info(graph_module) print(delegation_info.get_summary()) @@ -367,8 +701,7 @@ df = delegation_info.get_operator_delegation_dataframe() print(tabulate(df, headers="keys", tablefmt="fancy_grid")) ``` - -Take NanoGPT lowered to XNNPACK as an example: +For nanoGPT targeting the XNNPACK backend, you might see the following: ``` Total delegated subgraphs: 86 Number of delegated nodes: 473 @@ -376,121 +709,122 @@ Number of non-delegated nodes: 430 ``` -| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +| | op_type | # in_delegated_graphs | # in_non_delegated_graphs | |----|---------------------------------|------- |-----| | 0 | aten__softmax_default | 12 | 0 | | 1 | aten_add_tensor | 37 | 0 | | 2 | aten_addmm_default | 48 | 0 | | 3 | aten_arange_start_step | 0 | 25 | -| 4 | aten_bmm_default | 24 | 0 | -| 5 | aten_clone_default | 0 | 38 | -| 6 | aten_embedding_default | 0 | 2 | -| 7 | aten_expand_copy_default | 48 | 0 | -| 8 | aten_full_default | 0 | 12 | -| 9 | aten_full_like_default | 0 | 12 | -| 10 | aten_gelu_default | 0 | 12 | -| 11 | aten_index_tensor | 0 | 1 | -| 12 | aten_le_scalar | 0 | 12 | -| 13 | aten_logical_and_default | 0 | 12 | -| 14 | aten_logical_not_default | 0 | 12 | -| 15 | aten_mm_default | 1 | 0 | -| 16 | aten_mul_scalar | 24 | 0 | -| 17 | aten_native_layer_norm_default | 0 | 25 | -| 18 | aten_permute_copy_default | 109 | 0 | -| 19 | aten_scalar_tensor_default | 0 | 12 | -| 20 | aten_split_with_sizes_copy_default | 0 | 12 | -| 21 | aten_sub_tensor | 0 | 12 | -| 22 | aten_unsqueeze_copy_default | 0 | 24 | +| | ... | | | | 23 | aten_view_copy_default | 170 | 48 | -| 24 | aten_where_self | 0 | 12 | -| 25 | getitem | 0 | 147 | +| | ... | | | | 26 | Total | 473 | 430 | -In the table, we see that op type aten_view_copy_default appears 170 times in delegate graphs and 48 times in non-delegated graphs. - -| 23 | aten_view_copy_default | 170 | 48 | - -From here, we might want to know in which part of the graph it wasn’t delegated. For that, you can use the `print_delegated_graph` util function to see a printout of the whole graph with highlighted lowered graphs. +From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. +To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph. -2. Print graph module -Call this function right after you call `to_backend()` - -``` +```python from executorch.exir.backend.utils import print_delegated_graph -graph_module = self.edge_manager.exported_program().graph_module +graph_module = edge_manager.exported_program().graph_module print(print_delegated_graph(graph_module)) ``` +This may generate a large amount of output for large models. Consider using "Control+F" or "Command+F" to locate the operator you’re interested in +(e.g. “aten_view_copy_default”). Observe which instances are not under lowered graphs. -On the printed graph, you can do "Control+F" (or "Command+F" on a Mac) on the operator type you’re interested in (e.g. “aten_view_copy_default”) and observe which ones of them are not under “lowered graph()”s. +In the fragment of the output for nanoGPT below, observe that embedding and add operators are delegated to XNNPACK while the sub operator is not. 
-### Performance Analysis (Optional) +``` +%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) + %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) + %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] + backend_id: XnnpackBackend + lowered graph(): + %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] + %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + return (aten_add_tensor,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) +``` + +### Performance Analysis -Through the ExecuTorch SDK, users are able to profile a model and inspect its latency performance. +Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites ##### ETRecord generation (Optional) -ETRecord contains model graphs and metadata for linking runtime results (such as profiling) to the eager model. You will be able to view all profiling events with just ETDump (see next section), but with ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [https://pytorch.org/executorch/main/sdk-etrecord.html](https://pytorch.org/executorch/main/sdk-etrecord.html) +An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model. You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [the ETRecord docs](../sdk-etrecord.md). - -**Steps for enablement:** -ETRecord is created during export. In your export script, you just called `to_edge() `and it returned edge_program_manager +In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place. ``` import copy +from executorch.sdk import generate_etrecord -# Make the deep copy right after your call to to_edge() -edge_program_manager_copy = copy.deepcopy(edge_program_manager) +# Make the deep copy immediately after to to_edge() +edge_manager_copy = copy.deepcopy(edge_manager) # ... 
-# Then generate ETRecord right after your call to to_executorch() -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) +# Generate ETRecord right after to_executorch() +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_manager_copy, et_program) ``` -Run the export script, then the ETRecord should be generated under path ./etrecord.bin. - -##### ETDump generation -ETDump contains runtime results from executing an ExecuTorch model. For more information, see [https://pytorch.org/executorch/main/sdk-etdump.html](https://pytorch.org/executorch/main/sdk-etdump.html) +Run the export script and the ETRecord will be generated as `etrecord.bin`. +##### ETDump generation +An ETDump is an artifact generated at runtime containing a trace of the model execution. For more information, see [the ETDump docs](../sdk-etdump.md). -**Steps for enablement:** -You need to enable ETDump generation in your nanogpt_runner.cpp. +Include the ETDump header in your code. +```cpp +// main.cpp -1. Include the ETDump header in your code. -``` -#include +#include ``` -2. Create an Instance of the ETDumpGen class and pass it into the Module constructor -``` +Create an Instance of the ETDumpGen class and pass it to the Module constructor. +```cpp std::unique_ptr etdump_gen_ = std::make_unique(); -Module llm_model("nanogpt.pte", Module::MlockConfig::UseMlock, std::move(etdump_gen_)); +Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors, std::move(etdump_gen_)); ``` -3. Dump out the ETDump buffer after call to generate() -``` +After calling `generate()`, save the ETDump to a file. You can capture multiple +model runs in a single trace, if desired. +```cpp torch::executor::ETDumpGen* etdump_gen = -static_cast(llm_model.event_tracer()); + static_cast(model.event_tracer()); ET_LOG(Info, "ETDump size: %zu blocks", etdump_gen->get_num_blocks()); etdump_result result = etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { -// On a device with a file system users can just write it out -// to the file-system. -FILE* f = fopen("etdump.etdp", "w+"); -fwrite((uint8_t*)result.buf, 1, result.size, f); -fclose(f); -free(result.buf); + // On a device with a file system, users can just write it to a file. + FILE* f = fopen("etdump.etdp", "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); } ``` -4. Compile your binary with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. Add these to your CMakeLists.txt +Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: ``` +option(EXECUTORCH_BUILD_SDK "" ON) + +# ... + +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend # Provides the XNNPACK CPU acceleration backend + etdump) # Provides event tracing and logging + target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) ``` @@ -498,45 +832,35 @@ Run the runner, you will see “etdump.etdp” generated. #### Analyze with Inspector APIs -Once you’ve collected debug artifacts ETDump (and the optional ETRecord), you can feed them into Inspector APIs in order to get performance details. 
+Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. -##### Creating an Inspector -``` +```python from executorch.sdk import Inspector -inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin") -# If you did not generate an ETRecord, then just pass in ETDump: `inspector = Inspector(etdump_path="etdump.etdp")` -``` +inspector = Inspector(etdump_path="etdump.etdp") +# If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` -Using an Inspector -``` -with open("inspector_out.txt", "w") as file: +with open("inspector_out.txt", "w") as file: inspector.print_data_tabular(file) ``` -This saves the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows: - -| | event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name | -|---|----------------------|------------------|-----------|---------------|--------------|-------------|-------------|--------------|-------------|---------------------------|----------| -| 0 | Default | Method::init | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | [] | False | | -| 1 | Default | Program::load_method | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | [] | False | | -| 2 | Execute | native_call_arange.start_out | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | [] | False | | -| 3 | Execute | native_call_embedding.out | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | [] | False | | -| 4 | Execute | native_call_embedding.out | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | [] | False | | +This prints the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows look like this: +![](../_static/img/llm_manual_print_data_tabular.png) +View in full size -For more information about Inspector APIs and the rich functionality it provides, see [https://pytorch.org/executorch/main/sdk-inspector.html](https://pytorch.org/executorch/main/sdk-inspector.html). +To learn more about the Inspector and the rich functionality it provides, see the [Inspector API Reference](../sdk-inspector.md). -## How to use custom kernels -With our new custom op APIs, custom op/kernel authors can easily bring in their op/kernel into PyTorch/ExecuTorch and the process is streamlined. +## Custom Kernels +With the ExecuTorch custom operator APIs, custom operator and kernel authors can easily bring in their kernel into PyTorch/ExecuTorch. There are three steps to use custom kernels in ExecuTorch: -1. Prepare the kernel implementation using ExecuTorch types. -2. Compile and link the custom kernel to both AOT Python environment as well as the runner binary. +1. Write the custom kernel using ExecuTorch types. +2. Compile and link the custom kernel to both AOT Python environment as well as the runtime binary. 3. Source-to-source transformation to swap an operator with a custom op. -### Prepare custom kernel implementation +### Writing a Custom Kernel -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). 
For example: +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). ``` custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor @@ -544,89 +868,87 @@ custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) ``` -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: -``` -// custom_linear.h/custom_linear.cpp +Write your custom kernel according to the schema defined above. Use the `EXECUTORCH_LIBRARY` macro to make the kernel available to the ExecuTorch runtime. + +```cpp +// custom_linear.h / custom_linear.cpp #include Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - -// calculation -return out; + // calculation + return out; } -// opset namespace myop +// Register as myop::custom_linear.out EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); ``` -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: +To make this operator available in PyTorch, you can define a wrapper around the ExecuTorch custom kernel. Note that the ExecuTorch +implementation uses ExecuTorch tensor types, while the PyTorch wrapper uses ATen tensors. -``` +```cpp // custom_linear_pytorch.cpp + #include "custom_linear.h" #include at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { -// initialize out -at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); -// wrap kernel in custom_linear.cpp into ATen kernel -WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); -return out; + return out; } -// standard API to register ops into PyTorch +// Register the operator with PyTorch. TORCH_LIBRARY(myop, m) { - -m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - -m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); } ``` -### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our runner CMakeLists.txt we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - +### Compile and Link the Custom Kernel +To make it available to the ExecuTorch runtime, compile custom_linear.h/cpp into the binary target. You can also build the kernel as a dynamically loaded library (.so or .dylib) and link it as well. 
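+As a minimal sketch (assuming the `nanogpt_runner` target from the CMakeLists.txt earlier in this guide; the file name custom_linear.cpp is illustrative), adding the kernel source to the binary target is enough for `EXECUTORCH_LIBRARY` to register the operator when the runtime starts:
+
+```
+# Compile the custom kernel into the runner so its static registration runs.
+add_executable(nanogpt_runner main.cpp custom_linear.cpp)
+```
+
+The linked libraries (`executorch`, kernels, and any backends) stay the same as before.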
-Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: +To make it available to PyTorch, package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into the python environment. +This is needed to make PyTorch aware of the custom operator at the time of export. -``` +```python import torch -torch.ops.load_library("libcustom_linear.so/dylib") +torch.ops.load_library("libcustom_linear.so") ``` +Once loaded, you can use the custom operator in PyTorch code. -Once loaded we can perform the next step, of introducing the custom op into PyTorch environment. - -### Source-to-source transformation to introduce the custom op +For more information, see [PyTorch Custom Operators](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) and +and [ExecuTorch Kernel Registration](../kernel-library-custom-aten-kernel.md). -Easier way to introduce our customized linear is by rewriting the eager model. However, that may miss some occurrences of torch.nn.Linear in our example. A safer option is to walk through all the modules in the module hierarchy and perform the swapping. +### Using a Custom Operator in a Model -For example, we can do the following to swap torch.nn.Linear with our custom linear op: +The custom operator can explicitly used in the PyTorch model, or you can write a transformation to replace instances of a core operator with the custom variant. For this example, you could find +all instances of `torch.nn.Linear` and replace them with `CustomLinear`. -``` +```python def replace_linear_with_custom_linear(module): - for name, child in module.named_children(): - if isinstance(child, nn.Linear): + for name, child in module.named_children(): + if isinstance(child, nn.Linear): setattr( module, name, CustomLinear(child.in_features, child.out_features, child.bias), ) - else: - replace_linear_with_custom_linear(child) + else: + replace_linear_with_custom_linear(child) ``` -The rest of the steps will be the same as the normal flow. Now you can run this module in eager as well as export it to ExecuTorch and run on the runner. - -## How to build Mobile Apps -You can also execute an LLM using ExecuTorch on iOS and Android - -**For iOS details see the [iOS Sample App](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/apple_ios).** +The remaining steps are the same as the normal flow. Now you can run this module in eager mode as well as export to ExecuTorch. +## How to Build Mobile Apps +See the instructions for building and running LLMs using ExecuTorch on iOS and Android. 
-**For Android see the [Android Instructions](https://pytorch.org/executorch/main/llm/llama-demo-android.html).** +* **[iOS ExecuTorch LLaMA Demo App](llama-demo-ios.md)** +* **[Android ExecuTorch LLaMA Demo App](llama-demo-android.md)** diff --git a/docs/source/llm/llama-demo-ios.md b/docs/source/llm/llama-demo-ios.md new file mode 100644 index 00000000000..cc25a24f335 --- /dev/null +++ b/docs/source/llm/llama-demo-ios.md @@ -0,0 +1,2 @@ +```{include} ../../../examples/demo-apps/apple_ios/LLaMA/README.md +``` \ No newline at end of file diff --git a/docs/source/native-delegates-executorch-vulkan-delegate.md b/docs/source/native-delegates-executorch-vulkan-delegate.md new file mode 100644 index 00000000000..2c83c7f899c --- /dev/null +++ b/docs/source/native-delegates-executorch-vulkan-delegate.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/README.md diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 12b2e9c2ba7..1d12daef9d8 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,16 +74,8 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. After running the model it will produce basic per-op and total timings. We provide an example of the profiling below. The timings listed are the average across runs, and the units are in microseconds. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. -``` -Fully Connected (NC, F32) GEMM: 109.510002 -Total Time: 109.510002 -``` - -::::{note} -Profiling is a work in progress, and is planned to be integrated with [SDK Tools](sdk-delegate-integration.md) and Tensorboard. -:::: [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) ## Quantization diff --git a/docs/source/runtime-build-and-cross-compilation.md b/docs/source/runtime-build-and-cross-compilation.md index 22246b8f8c7..e328bd1541a 100644 --- a/docs/source/runtime-build-and-cross-compilation.md +++ b/docs/source/runtime-build-and-cross-compilation.md @@ -60,7 +60,7 @@ cd executorch # # NOTE: If your `buck2` binary is not on the PATH, you can change this line to # say something like `-DBUCK2=/tmp/buck2` to point directly to the tool. -(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DBUCK2=buck2 ..) +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) ``` Once this is done, you don't need to do it again until you pull from the upstream repo again, or if you modify any CMake-related files. 
diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 2ed256d2aeb..33deae3904b 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -23,6 +23,8 @@ ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Fo In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTestSuite`, to hold essential info for ExecuTorch program verification. +`MethodTestCase` represents a single testcase. Each `MethodTestCase` contains inputs and expected outputs for a single execution. + :::{dropdown} `MethodTestCase` ```{eval-rst} @@ -31,6 +33,8 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest ``` ::: +`MethodTestSuite` contains all testing info for single method, including a str representing method name, and a `List[MethodTestCase]` for all testcases: + :::{dropdown} `MethodTestSuite` ```{eval-rst} @@ -44,18 +48,18 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `create_bundled_program` API under `executorch/sdk/bundled_program/core.py` to generate `BundledProgram` by bundling the emitted ExecuTorch program with the `List[MethodTestSuite]`: +We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including + `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.core -.. autofunction:: create_bundled_program +.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: -`create_bundled_program` will do sannity check internally to see if the given `List[MethodTestSuite]` matches the given Program's requirements. Specifically: +Construtor of `BundledProgram `will do sannity check internally to see if the given `List[MethodTestSuite]` matches the given Program's requirements. Specifically: 1. The method_names of each `MethodTestSuite` in `List[MethodTestSuite]` for should be also in program. Please notice that it is no need to set testcases for every method in the Program. 2. The metadata of each testcase should meet the requirement of the coresponding inference methods input. @@ -83,20 +87,20 @@ To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch model and the representative inputs we want to test it along with. ```python - import torch +from executorch.exir import to_edge +from executorch.sdk import BundledProgram + from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program from executorch.sdk.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) - -from executorch.exir import to_edge +from torch._export import capture_pre_autograd_graph from torch.export import export -# Step 1: ExecuTorch Program Export +# Step 1: ExecuTorch Program Export class SampleModel(torch.nn.Module): """An example model with multi-methods. 
Each method has multiple input and single output""" @@ -105,7 +109,7 @@ class SampleModel(torch.nn.Module): self.a: torch.Tensor = 3 * torch.ones(2, 2, dtype=torch.int32) self.b: torch.Tensor = 2 * torch.ones(2, 2, dtype=torch.int32) - def encode(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: z = x.clone() torch.mul(self.a, x, out=z) y = x.clone() @@ -113,74 +117,62 @@ class SampleModel(torch.nn.Module): torch.add(y, q, out=y) return y - def decode(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: - y = x * q - torch.add(y, self.b, out=y) - return y -# Inference method names of SampleModel we want to bundle testcases to. +# Inference method name of SampleModel we want to bundle testcases to. # Notices that we do not need to bundle testcases for every inference methods. -method_names = ["encode", "decode"] +method_name = "forward" model = SampleModel() -capture_inputs = { - m_name: ( - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - ) - for m_name in method_names -} +# Inputs for graph capture. +capture_input = ( + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), +) -# Find each method of model needs to be traced my its name, export its FX Graph. -method_graphs = { - m_name: export(getattr(model, m_name), capture_inputs[m_name]) - for m_name in method_names -} +# Export method's FX Graph. +method_graph = export( + capture_pre_autograd_graph(model, capture_input), + capture_input, +) -# Emit the traced methods into ET Program. -program = to_edge(method_graphs).to_executorch().executorch_program + +# Emit the traced method into ET Program. +et_program = to_edge(method_graph).to_executorch() # Step 2: Construct MethodTestSuite for Each Method # Prepare the Test Inputs. -# number of input sets to be verified +# Number of input sets to be verified n_input = 10 -# Input sets to be verified for each inference methods. -# To simplify, here we create same inputs for all methods. -inputs = { - # Inference method name corresponding to its test cases. - m_name: [ - # Each list below is a individual input set. - # The number of inputs, dtype and size of each input follow Program's spec. - [ - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - ] - for _ in range(n_input) +# Input sets to be verified. +inputs = [ + # Each list below is a individual input set. + # The number of inputs, dtype and size of each input follow Program's spec. + [ + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), ] - for m_name in method_names -} + for _ in range(n_input) +] # Generate Test Suites method_test_suites = [ MethodTestSuite( - method_name=m_name, + method_name=method_name, test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, method_name)(*input), ), ) - for input in inputs[m_name] + for input in inputs ], - ) - for m_name in method_names + ), ] # Step 3: Generate BundledProgram - -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) # Step 4: Serialize BundledProgram to flatbuffer. 
serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( @@ -320,10 +312,10 @@ Here's the example of the dtype of test input not meet model's requirement: ```python import torch -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program - from executorch.exir import to_edge +from executorch.sdk import BundledProgram + +from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -344,15 +336,16 @@ class Module(torch.nn.Module): model = Module() method_names = ["forward"] -inputs = torch.ones(2, 2, dtype=torch.float) +inputs = (torch.ones(2, 2, dtype=torch.float), ) # Find each method of model needs to be traced my its name, export its FX Graph. -method_graphs = { - m_name: export(getattr(model, m_name), (inputs,)) for m_name in method_names -} +method_graph = export( + capture_pre_autograd_graph(model, inputs), + inputs, +) # Emit the traced methods into ET Program. -program = to_edge(method_graphs).to_executorch().executorch_program +et_program = to_edge(method_graph).to_executorch() # number of input sets to be verified n_input = 10 @@ -378,7 +371,7 @@ method_test_suites = [ test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, m_name)(*input),), ) for input in inputs[m_name] ], @@ -388,7 +381,7 @@ method_test_suites = [ # Generate BundledProgram -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) ``` :::{dropdown} Raised Error @@ -455,10 +448,10 @@ Another common error would be the method name in any `MethodTestSuite` does not ```python import torch -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import create_bundled_program - from executorch.exir import to_edge +from executorch.sdk import BundledProgram + +from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -477,18 +470,18 @@ class Module(torch.nn.Module): model = Module() - method_names = ["forward"] -inputs = torch.ones(2, 2, dtype=torch.float) +inputs = (torch.ones(2, 2, dtype=torch.float),) # Find each method of model needs to be traced my its name, export its FX Graph. -method_graphs = { - m_name: export(getattr(model, m_name), (inputs,)) for m_name in method_names -} +method_graph = export( + capture_pre_autograd_graph(model, inputs), + inputs, +) # Emit the traced methods into ET Program. 
-program = to_edge(method_graphs).to_executorch().executorch_program +et_program = to_edge(method_graph).to_executorch() # number of input sets to be verified n_input = 10 @@ -513,7 +506,7 @@ method_test_suites = [ test_cases=[ MethodTestCase( inputs=input, - expected_outputs=getattr(model, m_name)(*input), + expected_outputs=(getattr(model, m_name)(*input),), ) for input in inputs[m_name] ], @@ -525,7 +518,7 @@ method_test_suites = [ method_test_suites[0].method_name = "MISSING_METHOD_NAME" # Generate BundledProgram -bundled_program = create_bundled_program(program, method_test_suites) +bundled_program = BundledProgram(et_program, method_test_suites) ``` diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 1563038eb52..45e50b44e87 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -20,7 +20,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Span buffer((uint8_t*)debug_buffer, debug_buffer_size); etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kIntermediateOutputs); + EventTracerDebugLogLevel::kProgramOutputs); ``` - Intermediate outputs of executed (non-delegated) operations (will include the program level outputs too) @@ -28,7 +28,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Span buffer((uint8_t*)debug_buffer, debug_buffer_size); etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kProgramOutputs); + EventTracerDebugLogLevel::kIntermediateOutputs); ``` 3. Build the runtime with the pre-processor flag that enables tracking of debug events. Instructions are in the [ETDump documentation](./sdk-etdump.md). 4. Run your model and dump out the ETDump buffer as described [here](./sdk-etdump.md). (Do so similarly for the debug buffer if configured above) diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 7f8c61af8c5..80033711552 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -20,7 +20,7 @@ Delegate authors propagate what transformations occur in a lowered backend by re For example: - **{ 0: (10, 11), 1: (11, 12) }:** Identifiers 0 and 1 in the runtime correspond to operators with the debug handles (10, 11) and (11, 12) respectively. -- **{ “Fancy Fusion”: (11, 12, 15) }**: Identifier “Fancy Fusion” in the runtime corresponds to operators with debug handles (11, 12, 15). +- **{ “fused_op_1_2_3”: (11, 12, 15) }**: Identifier “fused_op_1_2_3” in the runtime corresponds to operators with debug handles (11, 12, 15), and 11, 12, 15 corresponds to the op 1, op 2 and op 3. ```{Note} Identifiers are a means of connecting runtime results to the model graph; the interpretation of the identifiers is defined by the delegate author. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 8937ea5a777..4eacb18b14c 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -34,31 +34,11 @@ if (result.buf != nullptr && result.size > 0) { } ``` -4. ***Compile*** your binary with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. - - i). 
***Buck*** - - In Buck, users simply depend on the etdump target which is: - ``` - //executorch/sdk/etdump:etdump_flatcc - ``` - When compiling their binary through Buck, users can pass in this buck config to enable the pre-processor flag. For example, when compiling `sdk_example_runner` to enable ETDump generation, users compile using the following command: - ``` - buck2 build -c executorch.event_tracer_enabled=true examples/sdk/sdk_example_runner:sdk_example_runner - ``` - - ii). ***CMake*** - - In CMake, users add this to their compile flags: - ``` - -DET_EVENT_TRACER_ENABLED - ``` - - This flag needs to be added to the ExecuTorch library and any operator library that the users are compiling into their binary. For reference, users can take a look at `examples/sdk/CMakeLists.txt`. The lines of of interest are: - ``` - target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) - target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) - ``` +4. ***Compile*** your binary using CMake with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. This flag needs to be added to the ExecuTorch library and any operator library that you are compiling into your binary. For reference, you can take a look at `examples/sdk/CMakeLists.txt`. The lines of interest are: +``` +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) +target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) +``` ## Using an ETDump -1. Pass this ETDump into the [Inspector API](./sdk-inspector.rst) to access this data and do post-run analysis. +Pass this ETDump into the [Inspector API](./sdk-inspector.rst) to access this data and do post-run analysis. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index e9eeb52b4f5..43ed5095c64 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -29,7 +29,7 @@ the ExecuTorch program (returned by the call to ``to_executorch()``), and option they are interested in working with via our tooling. .. warning:: - Users should do a deepcopy of the output of to_edge() and pass in the deepcopy to the generate_etrecord API. This is needed because the subsequent call, to_executorch(), does an in-place mutation and will lose debug data in the process. + Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. .. currentmodule:: executorch.sdk.etrecord._etrecord .. autofunction:: generate_etrecord diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index 23c529cb9d2..e15c1f2a395 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -56,6 +56,7 @@ print_data_tabular inspector.print_data_tabular() .. image:: _static/img/print_data_tabular.png +Note that the unit of delegate profiling events is "cycles". We're working on providing a way to set different units in the future. 
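The note above reports delegate profiling events in "cycles" and leaves unit conversion to the caller for now. As a minimal sketch (file paths are placeholders), an `Inspector` can be built from an ETDump and ETRecord and given a `delegate_time_scale_converter` callback, the same hook the Core ML `inspector_cli.py` change later in this diff uses, to rescale delegate timestamps:

```python
from executorch.sdk import Inspector


def delegate_cycles_to_seconds(event_name, raw_time):
    # Placeholder conversion: the right scale factor depends on the delegate
    # and on which clock it reports in.
    return raw_time / (1000 * 1000)


inspector = Inspector(
    etdump_path="etdump.etdp",   # placeholder path
    etrecord="etrecord.bin",     # placeholder path
    delegate_time_scale_converter=delegate_cycles_to_seconds,
)
inspector.print_data_tabular()
```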
find_total_for_module diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 85270a44bcf..53f7d88613a 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -14,7 +14,7 @@ The ExecuTorch SDK supports the following features: - Model loading and execution time - **Delegate Integration** - Surfacing performance details from delegate backends - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** (Intermediate outputs and output quality analysis) - Coming soon +- **Debugging** - Intermediate outputs and output quality analysis - **Visualization** - Coming soon ## Fundamental components of the SDK diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4defc3079e3..b82a1a9f25b 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -171,7 +171,7 @@ Now you should be able to find the executable built at `./cmake-out/backends/xnn ## Running the XNNPACK Model with Buck -Alternatively, you can use `buck2` to run the `.pte` file with XNNPACK delegate instructions in it on your host platform. You can follow the instructions here to install [buck2](getting-started-setup.md#building-a-runtime). You can now run it with the prebuilt `xnn_executor_runner` provided in the examples. This will run the model on some sample inputs. +Alternatively, you can use `buck2` to run the `.pte` file with XNNPACK delegate instructions in it on your host platform. You can follow the instructions here to install [buck2](getting-started-setup.md#Build-&-Run). You can now run it with the prebuilt `xnn_executor_runner` provided in the examples. This will run the model on some sample inputs. ```bash buck2 run examples/xnnpack:xnn_executor_runner -- --model_path ./mv2_xnnpack_fp32.pte diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index 49fc2c42b73..2071567ddd1 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -44,15 +44,11 @@ # # The first step of lowering to ExecuTorch is to export the given model (any # callable or ``torch.nn.Module``) to a graph representation. This is done via -# the two-stage APIs, ``torch._export.capture_pre_autograd_graph``, and -# ``torch.export``. -# -# Both APIs take in a model (any callable or ``torch.nn.Module``), a tuple of +# ``torch.export``, which takes in an ``torch.nn.Module``, a tuple of # positional arguments, optionally a dictionary of keyword arguments (not shown # in the example), and a list of dynamic shapes (covered later). 
import torch -from torch._export import capture_pre_autograd_graph from torch.export import export, ExportedProgram @@ -70,40 +66,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) -print("Pre-Autograd ATen Dialect Graph") -print(pre_autograd_aten_dialect) - -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) -print("ATen Dialect Graph") +aten_dialect: ExportedProgram = export(SimpleConv(), example_args) print(aten_dialect) ###################################################################### -# The output of ``torch._export.capture_pre_autograd_graph`` is a fully -# flattened graph (meaning the graph does not contain any module hierarchy, -# except in the case of control flow operators). Furthermore, the captured graph -# contains only ATen operators (~3000 ops) which are Autograd safe, for example, safe -# for eager mode training. -# -# The output of ``torch.export`` further compiles the graph to a lower and -# cleaner representation. Specifically, it has the following: -# -# - The graph is purely functional, meaning it does not contain operations with -# side effects such as mutations or aliasing. -# - The graph contains only a small defined -# `Core ATen IR `__ -# operator set (~180 ops), along with registered custom operators. -# - The nodes in the graph contain metadata captured during tracing, such as a -# stacktrace from user's code. +# The output of ``torch.export.export`` is a fully flattened graph (meaning the +# graph does not contain any module hierarchy, except in the case of control +# flow operators). Additionally, the graph is purely functional, meaning it does +# not contain operations with side effects such as mutations or aliasing. # # More specifications about the result of ``torch.export`` can be found -# `here `__ . +# `here `__ . # -# Since the result of ``torch.export`` is a graph containing the Core ATen -# operators, we will call this the ``ATen Dialect``, and since -# ``torch._export.capture_pre_autograd_graph`` returns a graph containing the -# set of ATen operators which are Autograd safe, we will call it the -# ``Pre-Autograd ATen Dialect``. +# The graph returned by ``torch.export`` only contains functional ATen operators +# (~2000 ops), which we will call the ``ATen Dialect``. 
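To make the claim above concrete, here is a small sketch (reusing the `aten_dialect` program exported from `SimpleConv` earlier in the tutorial) that lists which ATen operators ended up in the captured graph:

```python
# Sketch: enumerate the ATen ops recorded in the exported graph.
aten_ops = {
    str(node.target)
    for node in aten_dialect.graph.nodes
    if node.op == "call_function"
}
print(aten_ops)  # expect only core ATen ops, e.g. conv2d and relu variants
```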
###################################################################### # Expressing Dynamism @@ -124,10 +100,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: return x + y -f = Basic() example_args = (torch.randn(3, 3), torch.randn(3, 3)) -pre_autograd_aten_dialect = capture_pre_autograd_graph(f, example_args) -aten_dialect: ExportedProgram = export(f, example_args) +aten_dialect: ExportedProgram = export(Basic(), example_args) # Works correctly print(aten_dialect.module()(torch.ones(3, 3), torch.ones(3, 3))) @@ -153,15 +127,12 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: return x + y -f = Basic() example_args = (torch.randn(3, 3), torch.randn(3, 3)) dim1_x = Dim("dim1_x", min=1, max=10) dynamic_shapes = {"x": {1: dim1_x}, "y": {1: dim1_x}} -pre_autograd_aten_dialect = capture_pre_autograd_graph( - f, example_args, dynamic_shapes=dynamic_shapes +aten_dialect: ExportedProgram = export( + Basic(), example_args, dynamic_shapes=dynamic_shapes ) -aten_dialect: ExportedProgram = export(f, example_args, dynamic_shapes=dynamic_shapes) -print("ATen Dialect Graph") print(aten_dialect) ###################################################################### @@ -198,7 +169,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # As our goal is to capture the entire computational graph from a PyTorch # program, we might ultimately run into untraceable parts of programs. To # address these issues, the -# `torch.export documentation `__, +# `torch.export documentation `__, # or the # `torch.export tutorial `__ # would be the best place to look. @@ -207,10 +178,12 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # Performing Quantization # ----------------------- # -# To quantize a model, we can do so between the call to -# ``torch._export.capture_pre_autograd_graph`` and ``torch.export``, in the -# ``Pre-Autograd ATen Dialect``. This is because quantization must operate at a -# level which is safe for eager mode training. +# To quantize a model, we first need to capture the graph with +# ``torch._export.capture_pre_autograd_graph``, perform quantization, and then +# call ``torch.export``. ``torch._export.capture_pre_autograd_graph`` returns a +# graph which contains ATen operators which are Autograd safe, meaning they are +# safe for eager-mode training, which is needed for quantization. We will call +# the graph at this level, the ``Pre-Autograd ATen Dialect`` graph. # # Compared to # `FX Graph Mode Quantization `__, @@ -220,6 +193,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # will annotate the nodes in the graph with information needed to quantize the # model properly for a specific backend. 
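The tutorial code below captures the pre-autograd graph; as a hedged sketch of how the annotate/calibrate/convert steps described above typically look with the XNNPACK quantizer (the exact quantizer and configuration depend on the target backend, and `SimpleConv` is the example model defined earlier in the tutorial):

```python
import torch
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)
from torch.export import export

example_args = (torch.randn(1, 3, 256, 256),)
pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args)

# Annotate the graph for the target backend, calibrate, then convert.
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(pre_autograd_aten_dialect, quantizer)
prepared(*example_args)          # calibration with representative inputs
quantized = convert_pt2e(prepared)

# Finish with torch.export to get a quantized ATen Dialect program.
aten_dialect = export(quantized, example_args)
```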
+from torch._export import capture_pre_autograd_graph + example_args = (torch.randn(1, 3, 256, 256),) pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) print("Pre-Autograd ATen Dialect Graph") @@ -268,13 +243,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: from executorch.exir import EdgeProgramManager, to_edge example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) -print("Pre-Autograd ATen Dialect Graph") -print(pre_autograd_aten_dialect) - -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) -print("ATen Dialect Graph") -print(aten_dialect) +aten_dialect: ExportedProgram = export(SimpleConv(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) print("Edge Dialect Graph") @@ -298,16 +267,10 @@ def forward(self, x): encode_args = (torch.randn(1, 10),) -aten_encode: ExportedProgram = export( - capture_pre_autograd_graph(Encode(), encode_args), - encode_args, -) +aten_encode: ExportedProgram = export(Encode(), encode_args) decode_args = (torch.randn(1, 5),) -aten_decode: ExportedProgram = export( - capture_pre_autograd_graph(Decode(), decode_args), - decode_args, -) +aten_decode: ExportedProgram = export(Decode(), decode_args) edge_program: EdgeProgramManager = to_edge( {"encode": aten_encode, "decode": aten_decode} @@ -328,8 +291,7 @@ def forward(self, x): # rather than the ``torch.ops.aten`` namespace. example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) +aten_dialect: ExportedProgram = export(SimpleConv(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) print("Edge Dialect Graph") print(edge_program.exported_program()) @@ -353,7 +315,9 @@ def call_operator(self, op, args, kwargs, meta): print(transformed_edge_program.exported_program()) ###################################################################### -# Note: if you see error like `torch._export.verifier.SpecViolationError: Operator torch._ops.aten._native_batch_norm_legit_functional.default is not Aten Canonical`, +# Note: if you see error like ``torch._export.verifier.SpecViolationError: +# Operator torch._ops.aten._native_batch_norm_legit_functional.default is not +# Aten Canonical``, # please file an issue in https://github.com/pytorch/executorch/issues and we're happy to help! @@ -365,7 +329,7 @@ def call_operator(self, op, args, kwargs, meta): # backend through the ``to_backend`` API. An in-depth documentation on the # specifics of backend delegation, including how to delegate to a backend and # how to implement a backend, can be found -# `here <../compiler-delegate-and-partitioner.html>`__ +# `here <../compiler-delegate-and-partitioner.html>`__. 
# # There are three ways for using this API: # @@ -393,8 +357,7 @@ def forward(self, x): # Export and lower the module to Edge Dialect example_args = (torch.ones(1),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(LowerableModule(), example_args) -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) +aten_dialect: ExportedProgram = export(LowerableModule(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) to_be_lowered_module = edge_program.exported_program() @@ -460,8 +423,7 @@ def forward(self, x): example_args = (torch.ones(1),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(ComposedModule(), example_args) -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) +aten_dialect: ExportedProgram = export(ComposedModule(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) exported_program = edge_program.exported_program() print("Edge Dialect graph") @@ -499,8 +461,7 @@ def forward(self, a, x, b): example_args = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)) -pre_autograd_aten_dialect = capture_pre_autograd_graph(Foo(), example_args) -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) +aten_dialect: ExportedProgram = export(Foo(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) exported_program = edge_program.exported_program() print("Edge Dialect graph") @@ -534,8 +495,7 @@ def forward(self, a, x, b): example_args = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)) -pre_autograd_aten_dialect = capture_pre_autograd_graph(Foo(), example_args) -aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) +aten_dialect: ExportedProgram = export(Foo(), example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) exported_program = edge_program.exported_program() delegated_program = edge_program.to_backend(AddMulPartitionerDemo()) diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index 8cf186a8cd9..27474c2251e 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -20,7 +20,7 @@ # This tutorial will show a full end-to-end flow of how to utilize the SDK. # Specifically, it will: # -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord>`__, `ETDump <../sdk-etdump.html>`__). +# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). # 2. Create an Inspector class consuming these artifacts. # 3. Utilize the Inspector class to analyze the model. @@ -42,7 +42,7 @@ # # ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the # edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. # # In this tutorial, an example model (shown below) is used to demonstrate. @@ -113,9 +113,9 @@ def forward(self, x): ###################################################################### # # .. warning:: -# Users should do a deepcopy of the output of to_edge() and pass in the -# deepcopy to the generate_etrecord API. 
This is needed because the -# subsequent call, to_executorch(), does an in-place mutation and will +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will # lose debug data in the process. # @@ -169,21 +169,10 @@ def forward(self, x): f.write(serialized_bundled_program) ###################################################################### -# We provide 2 ways of executing the Bundled Model to generate the ``ETDump``: -# -# **Option 1:** -# -# Use Buck (follow `these instructions <../getting-started-setup.html#building-a-runtime>`__ to set up buck):: -# -# cd executorch -# buck2 run -c executorch.event_tracer_enabled=true examples/sdk/sdk_example_runner:sdk_example_runner -- --bundled_program_path -# -# **Option 2:** -# -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake):: +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: # # cd executorch -# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DBUCK2=buck2 -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. +# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. # cd .. # cmake --build cmake-out -j8 -t sdk_example_runner # ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path @@ -308,6 +297,6 @@ def forward(self, x): # ^^^^^^^^^^^^^^^ # # - `ExecuTorch SDK <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord>`__ +# - `ETRecord <../sdk-etrecord.html>`__ # - `ETDump <../sdk-etdump.html>`__ # - `Inspector <../sdk-inspector.html>`__ diff --git a/examples/README.md b/examples/README.md index bce3e08b58f..68f4d550dfe 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,6 +9,7 @@ ExecuTorch's extensive support spans from simple modules like "Add" to comprehen ## Directory structure ``` examples +├── llm_manual # A storage place for the files that [LLM Maunal](https://pytorch.org/executorch/main/llm/getting-started.html) needs ├── models # Contains a set of popular and representative PyTorch models ├── portable # Contains end-to-end demos for ExecuTorch in portable mode ├── selective_build # Contains demos of selective build for optimizing the binary size of the ExecuTorch runtime @@ -20,7 +21,7 @@ examples | └── mps # Contains end-to-end demos of MPS backend ├── arm # Contains demos of the Arm TOSA and Ethos-U NPU flows ├── qualcomm # Contains demos of Qualcomm QNN backend -├── xtensa # Contains demos of exporting and running a simple model on Xtensa Hifi4 DSP +├── cadence # Contains demos of exporting and running a simple model on Xtensa DSPs ├── third-party # Third-party libraries required for working on the demos └── README.md # This file ``` @@ -30,6 +31,9 @@ examples A user's journey may commence by exploring the demos located in the [`portable/`](./portable) directory. Here, you will gain insights into the fundamental end-to-end workflow to generate a binary file from a ML model in [portable mode](../docs/source/concepts.md##portable-mode-lean-mode) and run it on the ExecuTorch runtime. 
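As a rough sketch of that portable-mode workflow in Python (the model and file names below are illustrative; the scripts under `portable/` wrap the same export steps):

```python
import torch
from executorch.exir import to_edge
from torch.export import export


class Add(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return x + y


example_args = (torch.ones(1), torch.ones(1))
et_program = to_edge(export(Add(), example_args)).to_executorch()

# Write the .pte file that an ExecuTorch runner can then load and execute.
with open("add.pte", "wb") as f:
    f.write(et_program.buffer)
```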
+## Demo of Llama 2 and Llama 3 + +[This page](./models/llama2/README.md) demonstrates how to run Llama 2 7B and Llama 3 8B models on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. ## Demo of Selective Build @@ -37,7 +41,7 @@ To understand how to deploy the ExecuTorch runtime with optimization for binary ## Demo of ExecuTorch SDK -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification, and ETDump generation. +You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. ## Demo Apps @@ -63,11 +67,6 @@ You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`] The [`xtensa/`](./xtensa) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. - -## Demo of ExecuTorch SDK - -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. - ## Dependencies Various models and workflows listed in this directory have dependencies on some other packages. You need to follow the setup guide in [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup) to have appropriate packages installed. diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/apple/coreml/README.md b/examples/apple/coreml/README.md index a10f3efcc95..f4270956b2c 100644 --- a/examples/apple/coreml/README.md +++ b/examples/apple/coreml/README.md @@ -1,6 +1,6 @@ # Examples -This directory contains scripts and other helper utilities to illustrate an end-to-end workflow to run a **Core ML** delegated `torch.nn.module` with the **ExecuTorch** runtime. +This directory contains scripts and other helper utilities to illustrate an end-to-end workflow to run a Core ML delegated `torch.nn.module` with the ExecuTorch runtime. ## Directory structure @@ -13,7 +13,7 @@ coreml ## Using the examples -We will walk through an example model to generate a **Core ML** delegated binary file from a python `torch.nn.module` then we will use the `coreml/executor_runner` to run the exported binary file. +We will walk through an example model to generate a Core ML delegated binary file from a python `torch.nn.module` then we will use the `coreml_executor_runner` to run the exported binary file. 1. Following the setup guide in [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) you should be able to get the basic development environment for ExecuTorch working. @@ -27,7 +27,7 @@ cd executorch ``` -3. Run the export script to generate a **Core ML** delegated binary file. +3. 
Run the export script to generate a Core ML delegated binary file. ```bash cd executorch @@ -35,11 +35,11 @@ cd executorch # To get a list of example models python3 -m examples.portable.scripts.export -h -# Generates ./add_coreml_all.pte file if successful. +# Generates add_coreml_all.pte file if successful. python3 -m examples.apple.coreml.scripts.export --model_name add ``` -4. Once we have the **Core ML** delegated model binary (pte) file, then let's run it with the **ExecuTorch** runtime using the `coreml_executor_runner`. +4. Run the binary file using the `coreml_executor_runner`. ```bash cd executorch @@ -47,20 +47,30 @@ cd executorch # Builds the Core ML executor runner. Generates ./coreml_executor_runner if successful. ./examples/apple/coreml/scripts/build_executor_runner.sh -# Run the Core ML delegate model. +# Run the delegated model. ./coreml_executor_runner --model_path add_coreml_all.pte ``` ## Frequently encountered errors and resolution. -- The `examples.apple.coreml.scripts.export` could fail if the model is not supported by the **Core ML** backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`)are currently supported by the **Core ML** backend. +- The `examples.apple.coreml.scripts.export` could fail if the model is not supported by the Core ML backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`) are currently supported by the Core ML backend. -``` +```text add add_mul +dl3 +edsr +emformer_join +emformer_predict +emformer_transcribe +ic3 ic4 linear +llama2 +llava_encoder +mobilebert mul mv2 +mv2_untrained mv3 resnet18 resnet50 diff --git a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj index 66c0b182cd5..16e9e590027 100644 --- a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj +++ b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj @@ -15,7 +15,10 @@ C94D51642ACFCBC500AF47FD /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51632ACFCBC500AF47FD /* CoreML.framework */; }; C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51652ACFCBCB00AF47FD /* Accelerate.framework */; }; C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */; }; + C97BFFA42BC0C17300F55BAC /* libportable_kernels.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */; }; + C97BFFA62BC0C1F200F55BAC /* libportable_ops_lib.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */; }; C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */; }; + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -40,7 +43,10 @@ C94D51632ACFCBC500AF47FD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C94D51652ACFCBCB00AF47FD /* 
Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libcoremldelegate.a; path = libraries/libcoremldelegate.a; sourceTree = ""; }; + C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_kernels.a; path = libraries/libportable_kernels.a; sourceTree = ""; }; + C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_ops_lib.a; path = libraries/libportable_ops_lib.a; sourceTree = ""; }; C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "libraries/libprotobuf-lite.a"; sourceTree = ""; }; + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -49,11 +55,14 @@ buildActionMask = 2147483647; files = ( 38626BB52B225A890059413D /* libetdump.a in Frameworks */, + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, 38626BB42B225A560059413D /* libflatccrt.a in Frameworks */, C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */, C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */, C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */, + C97BFFA62BC0C1F200F55BAC /* libportable_ops_lib.a in Frameworks */, C94D51642ACFCBC500AF47FD /* CoreML.framework in Frameworks */, + C97BFFA42BC0C17300F55BAC /* libportable_kernels.a in Frameworks */, C94D51622ACFCBBA00AF47FD /* libsqlite3.tbd in Frameworks */, C94D515E2ACFCBA000AF47FD /* libexecutorch.a in Frameworks */, ); @@ -90,6 +99,9 @@ C94D515C2ACFCBA000AF47FD /* libexecutorch.a */, C94D51612ACFCBBA00AF47FD /* libsqlite3.tbd */, C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */, + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */, + C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */, + C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */, ); name = Frameworks; sourceTree = ""; diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index ad63d2a942c..347f3b4474f 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -37,6 +37,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_COREML=ON \ +-DCOREML_BUILD_EXECUTOR_RUNNER=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ -DCMAKE_MACOSX_BUNDLE=OFF \ @@ -60,12 +61,15 @@ cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" -find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libcoremldelegate.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 
'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch_no_prim_ops.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch_no_prim_ops.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lited.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; -cp -f "$EXECUTORCH_ROOT_PATH/third-party/flatcc/lib/libflatccrt.a" "$LIBRARIES_DIR_PATH" +find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libetdump.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libcoremldelegate.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libcoremldelegate.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libportable_ops_lib.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libportable_ops_lib.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libportable_kernels.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libportable_kernels.a" \; +cp -f "$EXECUTORCH_ROOT_PATH/third-party/flatcc/lib/libflatccrt.a" "$LIBRARIES_DIR_PATH/libflatccrt.a" # Build the runner echo "ExecuTorch: Building runner" diff --git a/examples/apple/coreml/scripts/extract_coreml_models.py b/examples/apple/coreml/scripts/extract_coreml_models.py index 32c750196dd..6317b0f3d3f 100644 --- a/examples/apple/coreml/scripts/extract_coreml_models.py +++ b/examples/apple/coreml/scripts/extract_coreml_models.py @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Copyright © 2024 Apple Inc. All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -55,7 +53,8 @@ def extract_coreml_models(pte_data: bytes): if executorchcoreml.unflatten_directory_contents( coreml_processed_bytes, str(model_path.absolute()) ): - print(f"CoreML model is extracted and saved to path = {model_path}") + print(f"Core ML models are extracted and saved to path = {model_path}") + model_index += 1 if len(coreml_delegates) == 0: print("The model isn't delegated to CoreML.") @@ -63,7 +62,7 @@ def extract_coreml_models(pte_data: bytes): if __name__ == "__main__": """ - Extracts the CoreML models embedded in the ``.pte`` file and saves them to the + Extracts the Core ML models embedded in the ``.pte`` file and saves them to the file system. 
""" parser = argparse.ArgumentParser() diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 3f8990bdab6..077c8c26ef7 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -7,7 +7,7 @@ import argparse import json -from typing import Any, Dict, Final, List, Tuple +from typing import Any, Dict, Final, List, Tuple, Union from executorch.sdk import Inspector from executorch.sdk.inspector._inspector_utils import compare_results @@ -34,6 +34,12 @@ def parse_coreml_delegate_metadata(delegate_metadatas: List[str]) -> Dict[str, A return {} +def convert_coreml_delegate_time( + event_name: Union[str, int], input_time: Union[int, float] +) -> Union[int, float]: + return input_time / (1000 * 1000) + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( @@ -60,6 +66,7 @@ def main() -> None: etrecord=args.etrecord_path, debug_buffer_path=args.debug_buffer_path, delegate_metadata_parser=parse_coreml_delegate_metadata, + delegate_time_scale_converter=convert_coreml_delegate_time, ) inspector.print_data_tabular(include_delegate_debug_data=True) if args.compare_results: diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 89c2b141b01..976ecebc979 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -42,7 +42,7 @@ add_compile_options("-Wall" "-Werror") include(${EXECUTORCH_ROOT}/build/Utils.cmake) -set(_common_compile_options -Wno-deprecated-declarations -fPIC) +set(_common_compile_options -Wno-deprecated-declarations -fPIC -DET_EVENT_TRACER_ENABLED) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -51,7 +51,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) # portable_ops_lib, etdump, bundled_program. find_package(executorch CONFIG REQUIRED) target_include_directories(executorch INTERFACE ${_common_include_directories}) -target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) +target_compile_options(executorch INTERFACE ${_common_compile_options}) find_package( gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party @@ -73,7 +73,7 @@ generate_bindings_for_kernels( FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml ) gen_operators_lib( - "portable_ops_lib" + "mps_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch) @@ -107,9 +107,9 @@ list(TRANSFORM _mps_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") add_executable(mps_executor_runner ${_mps_executor_runner__srcs}) if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(FLATCC_LIB flatcc_d) + set(FLATCC_LIB flatccrt_d) else() - set(FLATCC_LIB flatcc) + set(FLATCC_LIB flatccrt) endif() target_link_libraries(mps_executor_runner bundled_program @@ -117,7 +117,7 @@ target_link_libraries(mps_executor_runner bundled_program etdump ${FLATCC_LIB} mpsdelegate - portable_ops_lib + mps_portable_ops_lib ${mps_executor_runner_libs}) target_compile_options(mps_executor_runner PUBLIC ${_common_compile_options}) endif() diff --git a/examples/apple/mps/scripts/bench_utils.py b/examples/apple/mps/scripts/bench_utils.py new file mode 100644 index 00000000000..c00738987ab --- /dev/null +++ b/examples/apple/mps/scripts/bench_utils.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. 
+# + +import logging +import time + +import torch +from torch._export.exported_program import ExportedProgram + + +def assert_outputs_equal(model_output, ref_output): + """ + Helper testing function that asserts that the model output and the reference output + are equal with some tolerance. Due to numerical differences between eager mode and + the MPS's backend, we relax the detal such that absolute tolerance is 1e-3. and + relative tolerance is 1e-3. + """ + + # Compare the result from executor and eager mode direclty + if isinstance(ref_output, tuple) or isinstance(ref_output, list): + # Multiple outputs executor always returns tuple, even if there is one output + assert len(ref_output) == len( + model_output + ), "Length of outputs is not matching!" + for i in range(len(ref_output)): + assert torch.allclose( + model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 + ) + else: + # If one output, eager returns tensor while executor tuple of size 1 + assert torch.allclose( + model_output[0], ref_output, atol=1e-03, rtol=1e-03 + ), "Outputs are not matching!" + + +def bench_forward(func, *args): + # warmup + for _ in range(10): + func(*args) + + start = time.time() + for _ in range(100): + func(*args) + end = time.time() + return end - start + + +def executorch_forward_pass(model, inputs): + for _ in range(10): + model.forward(inputs) + + +def synchronize(): + torch.mps.synchronize() + + +def pytorch_forward_pass(model, inputs): + for _ in range(10): + model(*inputs) + synchronize() + + +def get_mps_inputs(inputs): + inputs_mps = [] + for tensor in inputs: + inputs_mps.append(tensor.to("mps")) + inputs_mps = tuple(inputs_mps) + return inputs_mps + + +def get_executorch_model(executorch_program: ExportedProgram): + try: + from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, + ) + + return _load_for_executorch_from_buffer(executorch_program.buffer) + except ImportError: + logging.info( + "ExecuTorch MPS delegate was built without pybind support (not possible to run forward pass within python)" + ) + return None + + +def bench_torch(executorch_program: ExportedProgram, model, inputs, model_name): + model = model.to("mps") + inputs_mps = get_mps_inputs(inputs) + + executorch_model = get_executorch_model(executorch_program) + if executorch_model is not None: + t_pytorch = bench_forward(pytorch_forward_pass, model, inputs_mps) + t_executorch = bench_forward(executorch_forward_pass, executorch_model, inputs) + + logging.info(f"Model name: {model_name}") + logging.info(f"Pytorch MPS forward pass: {t_pytorch} seconds") + logging.info(f"ExecuTorch MPS forward pass: {t_executorch} seconds") + logging.info( + f"ExecuTorch speedup: {((t_pytorch - t_executorch) / t_pytorch) * 100}%" + ) + + +def compare_outputs(executorch_program: ExportedProgram, model, inputs, model_name): + inputs_copy = [] + for t in inputs: + inputs_copy.append(t.detach().clone()) + inputs_copy = tuple(inputs_copy) + + pytorch_results = model(*inputs) + executorch_model = get_executorch_model(executorch_program) + if executorch_model is not None: + executorch_results = executorch_model.forward(inputs_copy) + assert_outputs_equal(executorch_results, pytorch_results) + logging.info( + f"Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for {model_name} are matching!" 
+ ) diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh new file mode 100755 index 00000000000..16754588b67 --- /dev/null +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. + +set -e + +MODE="Release" +OUTPUT="cmake-out" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "Build frameworks for Apple platforms." + echo "SOURCE_ROOT_DIR defaults to the current directory if not provided." + echo + echo "Options:" + echo " --output=DIR Output directory. Default: 'cmake-out'" + echo " --Debug Use Debug build mode. Default: 'Release'" + echo "Example:" + echo " $0 --output=cmake-out --Debug" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) usage ;; + --output=*) OUTPUT="${arg#*=}" ;; + --Debug) MODE="Debug" ;; + *) + if [[ -z "$SOURCE_ROOT_DIR" ]]; then + SOURCE_ROOT_DIR="$arg" + else + echo "Invalid argument: $arg" + exit 1 + fi + ;; + esac +done + +rm -rf "$OUTPUT" + +cmake -DBUCK2="$BUCK" \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE="$MODE" \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_BUILD_MPS=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . +cmake --build cmake-out -j9 --target install --config "$MODE" +CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" +# build mps_executor_runner +rm -rf cmake-out/examples/apple/mps +cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE="$MODE" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out/examples/apple/mps \ + examples/apple/mps + +cmake --build cmake-out/examples/apple/mps -j9 --config "$MODE" + +echo "Build succeeded!" 
+ +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mps_logical_not.pte --bundled_program diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index a86a54c4d5c..0bfef7bf4ce 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -10,6 +10,7 @@ import logging import torch +from examples.apple.mps.scripts.bench_utils import bench_torch, compare_outputs from executorch import exir from executorch.backends.apple.mps.mps_preprocess import MPSBackend from executorch.backends.apple.mps.partition.mps_partitioner import MPSPartitioner @@ -36,7 +37,28 @@ FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) -if __name__ == "__main__": + +def get_bundled_program(executorch_program, example_inputs, expected_output): + method_test_suites = [ + MethodTestSuite( + method_name="forward", + test_cases=[ + MethodTestCase( + inputs=example_inputs, expected_outputs=[expected_output] + ) + ], + ) + ] + logging.info(f"Expected output: {expected_output}") + + bundled_program = BundledProgram(executorch_program, method_test_suites) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + return bundled_program_buffer + + +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", @@ -54,11 +76,18 @@ parser.add_argument( "--use_partitioner", - default=False, + default=True, action=argparse.BooleanOptionalAction, help="Use MPS partitioner to run the model instead of using whole graph lowering.", ) + parser.add_argument( + "--bench_pytorch", + default=False, + action=argparse.BooleanOptionalAction, + help="Bench ExecuTorch MPS foward pass with PyTorch MPS forward pass.", + ) + parser.add_argument( "-b", "--bundled", @@ -68,6 +97,15 @@ help="Flag for bundling inputs and outputs in the final flatbuffer program", ) + parser.add_argument( + "-c", + "--check_correctness", + action="store_true", + required=False, + default=False, + help="Whether to compare the ExecuTorch MPS results with the PyTorch forward pass", + ) + parser.add_argument( "--generate_etrecord", action="store_true", @@ -76,25 +114,64 @@ help="Generate ETRecord metadata to link with runtime results (used for profiling)", ) + parser.add_argument( + "--checkpoint", + required=False, + default=None, + help="checkpoing for llama model", + ) + + parser.add_argument( + "--params", + required=False, + default=None, + help="params for llama model", + ) + args = parser.parse_args() + return args + + +def get_model_config(args): + model_config = {} + model_config["module_name"] = MODEL_NAME_TO_MODEL[args.model_name][0] + model_config["model_class_name"] = MODEL_NAME_TO_MODEL[args.model_name][1] + + if args.model_name == "llama2": + if args.checkpoint: + model_config["checkpoint"] = args.checkpoint + if args.params: + model_config["params"] = args.params + model_config["use_kv_cache"] = True + return model_config + + +if __name__ == "__main__": + args = parse_args() if args.model_name not in MODEL_NAME_TO_MODEL: raise RuntimeError(f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}.") - model, example_inputs, _ = EagerModelFactory.create_model( - *MODEL_NAME_TO_MODEL[args.model_name] - ) + model_config = get_model_config(args) + model, example_inputs, _ = EagerModelFactory.create_model(**model_config) model = model.eval() + if args.check_correctness or args.bench_pytorch: + model_copy = copy.deepcopy(model) + 
inputs_copy = [] + for t in example_inputs: + inputs_copy.append(t.detach().clone()) + inputs_copy = tuple(inputs_copy) # pre-autograd export. eventually this will become torch.export - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + with torch.no_grad(): + model = torch._export.capture_pre_autograd_graph(model, example_inputs) + edge: EdgeProgramManager = export_to_edge( + model, + example_inputs, + edge_compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) - edge: EdgeProgramManager = export_to_edge( - model, - example_inputs, - edge_compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) edge_program_manager_copy = copy.deepcopy(edge) compile_specs = [CompileSpec("use_fp16", bytes([args.use_fp16]))] @@ -120,31 +197,30 @@ model_name = f"{args.model_name}_mps" if args.bundled: - method_test_suites = [ - MethodTestSuite( - method_name="forward", - test_cases=[ - MethodTestCase( - inputs=example_inputs, expected_outputs=[model(*example_inputs)] - ) - ], - ) - ] - logging.info(f"Expected output: {model(*example_inputs)}") - - bundled_program = BundledProgram(executorch_program, method_test_suites) - bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( - bundled_program + expected_output = model(*example_inputs) + bundled_program_buffer = get_bundled_program( + executorch_program, example_inputs, expected_output ) model_name = f"{model_name}_bundled" extension = "fp16" if not args.use_fp16: extension = "fp32" - model_name = f"{model_name}_{extension}" + model_name = f"{model_name}_{extension}.pte" if args.generate_etrecord: etrecord_path = "etrecord.bin" logging.info("generating etrecord.bin") generate_etrecord(etrecord_path, edge_program_manager_copy, executorch_program) - save_pte_program(executorch_program, model_name) + if args.bundled: + with open(model_name, "wb") as file: + file.write(bundled_program_buffer) + logging.info(f"Saved bundled program to {model_name}") + else: + save_pte_program(executorch_program, model_name) + + if args.bench_pytorch: + bench_torch(executorch_program, model_copy, example_inputs, model_name) + + if args.check_correctness: + compare_outputs(executorch_program, model_copy, inputs_copy, model_name) diff --git a/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake b/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake index 506e78d0bcc..10ebdf0945f 100644 --- a/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +++ b/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake @@ -58,7 +58,13 @@ add_compile_definitions( add_link_options( -mcpu=${GCC_CPU} -mthumb - --specs=nosys.specs) +) + +if(SEMIHOSTING) + add_link_options(--specs=rdimon.specs) +else() + add_link_options(--specs=nosys.specs) +endif() # Set floating point unit if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index c738a9502bf..98d136facec 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -12,6 +12,8 @@ if(NOT DEFINED ET_PTE_FILE_PATH) "model is built into the binary.") endif() +option(SEMIHOSTING "Enable semihosting" OFF) + # Example ExecuTorch demo for bare metal Cortex-M based systems set(ET_DIR_PATH "../../.." 
CACHE PATH "Path to ExecuTorch dir") @@ -43,6 +45,11 @@ add_library(executorch STATIC IMPORTED) set_property(TARGET executorch PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/libexecutorch.a") +add_library(executorch_no_prim_ops STATIC IMPORTED) +set_property(TARGET executorch_no_prim_ops PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/libexecutorch_no_prim_ops.a") +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) + add_library(executorch_delegate_ethos_u STATIC IMPORTED) set_property(TARGET executorch_delegate_ethos_u PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") @@ -100,3 +107,17 @@ target_include_directories(arm_executor_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_dependencies(arm_executor_runner gen_model_header) + + +if(SEMIHOSTING) +target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) +endif() + +# Fixup compilation of retarget.c +if(SEMIHOSTING) +# Remove this when MLBEDSW-8910 is closed. +set_source_files_properties( + ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + PROPERTIES HEADER_FILE_ONLY TRUE +) +endif() \ No newline at end of file diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 3423abfdfb3..2e97359a569 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -31,8 +31,8 @@ using namespace std; using torch::executor::Error; using torch::executor::Result; -__attribute__((section(".sram.data"), aligned(16))) -uint8_t method_allocator_pool[4 * 1024U]; +__attribute__((section(".sram.data"), aligned(16))) uint8_t + method_allocator_pool[4 * 1024U]; void et_pal_init(void) {} diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 72cc4c94a5b..f626b4caeb3 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -215,7 +215,7 @@ function setup_vela() { if [[ ! -e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=b90666d9b43f4b5223bb4dcecdbee87b2ad757c2 + base_rev=92240e7979018a197b42aab2da16dc002d86f224 patch_repo fi cd "${root_dir}/ethos-u-vela" diff --git a/examples/xtensa/CMakeLists.txt b/examples/cadence/CMakeLists.txt similarity index 90% rename from examples/xtensa/CMakeLists.txt rename to examples/cadence/CMakeLists.txt index 8c9a251a168..ec4658b6701 100644 --- a/examples/xtensa/CMakeLists.txt +++ b/examples/cadence/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) endif() # Set the project name. -project(xtensa_executorch_example) +project(cadence_executorch_example) # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) @@ -100,21 +100,21 @@ add_custom_command( add_custom_target(gen_model_header DEPENDS ${CMAKE_BINARY_DIR}/model_pte.h) -add_executable(xtensa_executorch_example executor_runner.cpp) -add_dependencies(xtensa_executorch_example gen_model_header) +add_executable(cadence_executorch_example executor_runner.cpp) +add_dependencies(cadence_executorch_example gen_model_header) # lint_cmake: -linelength -target_include_directories(xtensa_executorch_example PUBLIC ${ROOT_DIR}/.. +target_include_directories(cadence_executorch_example PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} ${_common_include_directories}) -target_link_options(xtensa_executorch_example PRIVATE +target_link_options(cadence_executorch_example PRIVATE -mlsp=${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/xtensa/min-rt) -target_link_libraries(xtensa_executorch_example dsp_mu_polling_libs - xtensa_ops_lib extension_runner_util executorch) +target_link_libraries(cadence_executorch_example dsp_mu_polling_libs + cadence_ops_lib extension_runner_util executorch) add_custom_command( - TARGET xtensa_executorch_example + TARGET cadence_executorch_example POST_BUILD COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/utils/post_compilation.py diff --git a/examples/cadence/aot/compiler.py b/examples/cadence/aot/compiler.py new file mode 100644 index 00000000000..36a5b308553 --- /dev/null +++ b/examples/cadence/aot/compiler.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import Any, Callable, Tuple + +import torch + +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge + +from torch.export import export +from torch.export.exported_program import ExportedProgram + + +def export_program( + model: Callable, + inputs: Any, + pt2_quant: bool = False, +) -> ExportedProgram: + # we don't support training mode. Make it eval + if hasattr(model, "eval"): + if pt2_quant: + # pyre-fixme[6]: Incompatible parameter type. + torch.ao.quantization.move_exported_model_to_eval(model) + else: + # pyre-fixme[16]: Anonymous callable has no attribute `eval`. + model.eval() + + # if it's already an ExportedProgram, just return it + if isinstance(model, ExportedProgram): + return model + + assert isinstance(model, torch.nn.Module), "model should be an nn.Module" + + # Prevent mkldnn decompositions + torch._C._set_mkldnn_enabled(False) + + # else: capture the model and return it. + return export(model, inputs) + + +# Export the model and lower it it edge IR. +def export_to_edge( + model: Callable, + inputs: Any, + pt2_quant: bool = False, + dump_graphs: bool = False, +) -> Tuple[EdgeProgramManager, ExportedProgram]: + # Export the model into an ExportedProgram. + expo_program = export_program(model, inputs, pt2_quant) + + if dump_graphs: + logging.info(f"Exported graph:\n{expo_program.graph_module.graph}") + + # Call to_edge to convert the graph to edge IR. + edge_prog_manager = to_edge( + expo_program, compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + + if dump_graphs: + logging.info( + f"Edge graph:\n{edge_prog_manager.exported_program().graph_module.graph}" + ) + + return edge_prog_manager, expo_program diff --git a/examples/cadence/aot/export_example.py b/examples/cadence/aot/export_example.py new file mode 100644 index 00000000000..864df963f75 --- /dev/null +++ b/examples/cadence/aot/export_example.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
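The export_to_edge helper added above in compiler.py wraps torch.export and exir.to_edge and returns both program objects; export_example.py below drives it through the quantized path. For the plain fp32 path, a minimal sketch is shown here (the TinyLinear module and the examples.cadence.aot.compiler import path are illustrative assumptions, not part of this patch):

    import torch

    # Assumed import path for the file added above as examples/cadence/aot/compiler.py.
    from examples.cadence.aot.compiler import export_to_edge


    class TinyLinear(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 2)

        def forward(self, x):
            return self.linear(x)


    # Returns (EdgeProgramManager, ExportedProgram) so callers can inspect both the
    # edge graph and the originally exported graph, as print_ops_info does later.
    edge_prog_manager, exported_prog = export_to_edge(
        TinyLinear(), (torch.randn(1, 4),), dump_graphs=True
    )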
+ +# Example script for exporting simple models to flatbuffer + +import logging + +from .meta_registrations import * # noqa + +from torch._export import capture_pre_autograd_graph +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + +from ...portable.utils import save_pte_program + +from .compiler import export_to_edge +from .quantizer import ( + CadenceBaseQuantizer, + QuantFusion, + ReplacePT2DequantWithCadenceDequant, + ReplacePT2QuantWithCadenceQuant, +) +from .utils import print_ops_info + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def export_model(model, example_inputs): + # Quantizer + quantizer = CadenceBaseQuantizer() + + # Export + model_exp = capture_pre_autograd_graph(model, example_inputs) + + # Prepare + prepared_model = prepare_pt2e(model_exp, quantizer) + prepared_model(*example_inputs) + + # Convert + converted_model = convert_pt2e(prepared_model) + + # pyre-fixme[16]: Pyre doesn't get that CadenceQuantizer has a patterns attribute + patterns = [q.pattern for q in quantizer.quantizers] + QuantFusion(patterns)(converted_model) + + # Get edge program (note: the name will change to export_to_cadence in future PRs) + edge_prog_manager, expo_prog = export_to_edge( + converted_model, example_inputs, pt2_quant=True + ) + + # Run a couple required passes for quant/dequant ops + cadence_prog_manager = edge_prog_manager.transform( + [ReplacePT2QuantWithCadenceQuant(), ReplacePT2DequantWithCadenceDequant()], + check_ir_validity=False, + ) + + exec_prog = cadence_prog_manager.to_executorch() + + logging.info( + f"Final exported graph module:\n{exec_prog.exported_program().graph_module}" + ) + + # Print some information to terminal + print_ops_info( + expo_prog.graph_module, + edge_prog_manager.exported_program().graph_module, + cadence_prog_manager.exported_program().graph_module, + ) + + # Save the program as CadenceDemoModel.pte + save_pte_program(exec_prog, "CadenceDemoModel") diff --git a/examples/cadence/aot/meta_registrations.py b/examples/cadence/aot/meta_registrations.py new file mode 100644 index 00000000000..f7dabe1ec11 --- /dev/null +++ b/examples/cadence/aot/meta_registrations.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple + +import torch +from executorch.exir.scalar_type import ScalarType +from torch.library import impl, Library + +from .utils import get_conv1d_output_size + +lib = Library("cadence", "DEF") + +lib.define( + "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" +) + +lib.define( + "quantized_layer_norm(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)" +) + +lib.define( + "quantized_layer_norm.out(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)" +) + +lib.define( + "quantized_linear(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)" +) +lib.define( + "quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define("quantized_relu(Tensor X, Tensor X_zero_point) -> (Tensor Y)") + +lib.define( + "quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor (a!)" +) + +lib.define( + "quantized_conv(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor Z)" +) +lib.define( + "quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" +) + +m = Library("cadence", "IMPL", "Meta") + + +@impl(m, "quantize_per_tensor") +def quantize_per_tensor_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: ScalarType, +): + return input.new_empty(input.size(), dtype=dtype) + + +@impl(m, "dequantize_per_tensor") +def dequantize_per_tensor_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: ScalarType, +): + return input.new_empty(input.size(), dtype=torch.float) + + +@impl(m, "quantized_linear") +def quantized_linear_meta( + src: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + in_zero_point: int, + weight_zero_point: torch.Tensor, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, + out_zero_point: int, + offset: Optional[torch.Tensor], +): + # src comes in shape [leading_dims, in_dim] + # weight comes in shape [out_dim, in_dim] + # output comes in empty with shape [leading_dims, out_dim] + out_size = list(src.size()) + weight_size = list(weight.size()) + assert len(weight_size) == 2 + out_size[-1] = weight_size[0] + return src.new_empty(out_size, dtype=torch.uint8) + + +@impl(m, "quantized_conv") +def quantized_conv_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, + channel_last: bool = False, +): + out_channels, _in_channels, *kernel_size = weight.shape + in_size = input.shape + # Assert that the input tensor has at least 3 dimensions, and at most 6 + assert len(in_size) 
> 2 + assert len(in_size) < 6 + + # Compute the output tensor size + output_size = get_conv1d_output_size( + in_size, out_channels, stride[0], padding[0], dilation[0], kernel_size[0] + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@impl(m, "quantized_layer_norm") +def quantized_layer_norm_meta( + input: torch.Tensor, + X_scale: torch.Tensor, + X_zero_point: torch.Tensor, + normalized_shape: int, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + output_scale: float, + output_zero_point: int, +): + return input.new_empty(input.size(), dtype=torch.uint8) + + +@impl(m, "quantized_relu") +def quantized_relu_meta( + X: torch.Tensor, + X_zero_point: torch.Tensor, +): + return X.new_empty(X.size(), dtype=torch.uint8) diff --git a/examples/cadence/aot/quantizer.py b/examples/cadence/aot/quantizer.py new file mode 100644 index 00000000000..df184f9d92c --- /dev/null +++ b/examples/cadence/aot/quantizer.py @@ -0,0 +1,855 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from math import frexp, isclose, trunc +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +from torch import fx + +from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver +from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions +from torch.ao.quantization.quantizer import Quantizer +from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + OperatorConfig, + QuantizationAnnotation, + QuantizationConfig, + QuantizationSpec, + SharedQuantizationSpec, +) +from torch.fx import GraphModule +from torch.fx.passes.infra.pass_base import PassResult +from torch.fx.passes.utils.fuser_utils import legalize_graph + + +def quantize_tensor_multiplier( + requantize_scale_tensor: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Given requantize_scale_tensor with values in the interval (0, 1), + produce a pair of tensors (out_multiplier, right_shift) where out_multiplier + is an int32 tensor representing fixed-point values in the interval [-1, 1), + and right_shift is an amount to shift right by, so that the floating-point + multiplication of some int32 input with each value of requantize_scale_tensor: + result = int32_value * requantize_scale_tensors[i] + is best approximated by the integer-arithmetic-only code: + result = RoundingRightShift(FixedPointMultiplication(int32_value, + out_multiplier[i]), right_shift[i]) + """ + + # This is identical to C++11 std::round(). The general python round rounds + # down, and C++ rounds away from zero. + def round_away_zero(f) -> int: + r = -0.5 if (f < 0) else 0.5 + return trunc(f + r) + + def quantize_scalar_multiplier(requantize_scale: float) -> Tuple[int, int]: + significand, exponent = frexp(requantize_scale) + significand_q31 = int(round_away_zero(significand * (1 << 31))) + # Handle the special case when the real multiplier was so close to 1 + # that its fixed-point approximation was indistinguishable from 1. + # We handle this by dividing it by two, incrementing exponent by 1. + # the right shift amount. 
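# (Worked example with a hypothetical input, not taken from this patch: for
# requantize_scale = 1 - 2**-53, the largest double below 1.0, frexp returns
# (1 - 2**-53, 0) and round_away_zero(significand * (1 << 31)) is exactly
# 1 << 31, which no longer fits the int32 multiplier. The branch below halves
# it to 1 << 30 and bumps the exponent to 1, so the represented value
# (2**30 / 2**31) * 2**1 == 1.0 still matches the scale within the tolerance
# asserted just after.)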
+ if significand_q31 == (1 << 31): + significand_q31 //= 2 + exponent += 1 + + # Verify that the decomposition of requantize_scale into significand + # and exponent is correct. + reconstructed = significand_q31 / (1 << 31) * pow(2, exponent) + assert isclose( + requantize_scale, reconstructed, rel_tol=1e-4, abs_tol=1e-4 + ), "computation of significand and exponent from requantize_scale is not accurate" + + return (significand_q31, exponent) + + # Flatten the input scale tensor so that we can operate on individual values + orig_shape = requantize_scale_tensor.shape + flattened_tensor = requantize_scale_tensor.flatten().to(torch.float32) + out_multiplier = torch.zeros(flattened_tensor.shape, dtype=torch.int32) + right_shift = torch.zeros(flattened_tensor.shape, dtype=torch.int32) + + # Iterate over the flattened scale tensor and compute the decomposition of + # each value in scale tensor into significand(out_multiplier) and + # exponent(right_shift) + for idx, scale in enumerate(flattened_tensor): + (si, ex) = quantize_scalar_multiplier(scale) + out_multiplier[idx], right_shift[idx] = si, ex + + # Reshape the tensors back to the original shape + out_multiplier = out_multiplier.reshape(orig_shape) + right_shift = right_shift.reshape(orig_shape) + + return (out_multiplier, right_shift) + + +def _is_annotated(nodes: List[fx.Node]) -> bool: + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def _no_outside_users(fused_partition) -> bool: + """ + Checks if each partition other than the last does not have any outside users. + """ + for source_partition in fused_partition[:-1]: + if len(source_partition.output_nodes) != 1: + return False + if len(source_partition.output_nodes[0].users) != 1: + return False + return True + + +# Helper function to get the weight node for both quantized and unquantized weights +# TODO(matthiascremon): get a better test! +def get_weight_node(weights_inputs: fx.Node, dequants_weights: fx.Node) -> fx.Node: + """ + Returns the weight node. + """ + weight_node = ( + weights_inputs + if weights_inputs.name.endswith("_frozen_param") + else dequants_weights + ) + return weight_node + + +# Helper function to get the args and kwargs for the linear replacement op +def get_args_and_kwargs_linear( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + quant_node: fx.Node, +) -> Tuple[Tuple[Any], Dict[str, Any]]: + """ + Returns the args and kwargs for the linear replacement op. 
+ """ + weight_scale = get_weight_node(weights_inputs[0], dequants_weights[0]).args[1] + # pyre-fixme[58]: Unsupported operand types + bias_scale = dequants_inputs[0].args[1] * weight_scale + requantize_scale = bias_scale / quant_node.args[1] + requantize_scale_t = torch.tensor([requantize_scale]) + + (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t) + + # If bias is not available, create a bias tensor with the shape of weight[0] + if not bias_inputs: + weight_node = get_weight_node(weights_inputs[0], dequants_weights[0]).args[0] + # pyre-fixme[16]: Undefined attribute + attr_node = getattr(graph_module, weight_node.target) + weight_shape = list(attr_node.shape) + bias_shape = weight_shape[0] + bias = graph_module.graph.call_function( + torch.ops.aten.full.default, ([bias_shape], 0.0) + ) + else: + bias = bias_inputs[0] + + bias_int32_quant = graph_module.graph.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default, + ( + bias, + bias_scale, + 0, + -(2**31), + (2**31) - 1, + torch.int32, + ), + ) + + # Create single element tensors for weight_zero_point, out_multiplier, out_shift. + # Note that the function expects int32_t, when it would default to int64_t, so + # we explicitly require that type. + weight_zero_point_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_weights[0].args[2]), + {"dtype": torch.int32}, + ) + out_multiplier_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_multiplier[0].item()), + {"dtype": torch.int32}, + ) + out_shift_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_shift[0].item()), + {"dtype": torch.int32}, + ) + + args = tuple(inputs_inputs + weights_inputs + other_inputs + [bias_int32_quant]) + kwargs = { + "src_zero_point": dequants_inputs[0].args[2], + "weight_zero_point": weight_zero_point_, + "out_multiplier": out_multiplier_, + "out_shift": out_shift_, + "out_zero_point": quant_node.args[2], + "offset": None, + } + return args, kwargs + + +# Helper function to get the args and kwargs for the layer norm replacement op +def get_args_and_kwargs_layer_norm( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], + other_inputs: List[fx.Node], + weights_init_inputs: List[fx.Node], + bias_inputs: List[fx.Node], + quant_node: fx.Node, +) -> Tuple[Tuple[Any], Dict[str, Any]]: + """ + Returns the args and kwargs for the layer norm replacement op. 
+ """ + # Check if the input is per-channel quantized + # TODO(matthiascremon): add proper support and testing for per-channel quantization + assert isinstance(dequants_inputs[0].args[1], float) and isinstance( + dequants_inputs[0].args[2], int + ), "per-channel quantization is not supported for layer norm, both scale and zero_point should be scalars" + + # Make the scale and zero_point tensors + scale_tensor = graph_module.graph.call_function( + torch.ops.aten.full.default, + ( + [1], + dequants_inputs[0].args[1], + ), + ) + zero_point_tensor = graph_module.graph.call_function( + torch.ops.aten.full.default, + ( + [1], + dequants_inputs[0].args[2], + ), + ) + + # Make the args and kwargs for the replacement op + args = tuple(inputs_inputs + [scale_tensor] + [zero_point_tensor]) + kwargs = { + "normalized_shape": other_inputs[0], + "weight": weights_init_inputs[0], + "bias": bias_inputs[0], + "eps": 1e-05, + "output_scale": quant_node.args[1], + "output_zero_point": quant_node.args[2], + } + return args, kwargs + + +def get_conv_args(arg, first_val: int) -> List[fx.Node]: + return arg if len(arg) == 2 else [first_val, arg[0]] + + +def get_args_and_kwargs_conv1d( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + quant_node: fx.Node, + op_node: fx.Node, +): + weight_scale = get_weight_node(weights_inputs[0], dequants_weights[0]).args[1] + weight_zero_point = get_weight_node(weights_inputs[0], dequants_weights[0]).args[2] + # pyre-fixme[58]: Unsupported operand types + bias_scale = dequants_inputs[0].args[1] * weight_scale + stride = [1, 1] if len(op_node.args) < 4 else get_conv_args(op_node.args[3], 1) + padding = [0, 0] if len(op_node.args) < 5 else get_conv_args(op_node.args[4], 0) + dilation = [1, 1] if len(op_node.args) < 6 else get_conv_args(op_node.args[5], 1) + groups = 1 if len(op_node.args) < 7 else op_node.args[6] + # If bias is not available, create a bias tensor with the shape of weight[0] + if not bias_inputs: + weight_node = get_weight_node(weights_inputs[0], dequants_weights[0]).args[0] + # pyre-fixme[16]: Undefined attribute + attr_node = getattr(graph_module, weight_node.target) + weight_shape = list(attr_node.shape) + bias_shape = weight_shape[0] + bias = graph_module.graph.call_function( + torch.ops.aten.full.default, ([bias_shape], 0.0) + ) + else: + bias = bias_inputs[0] + # The bias is quantized to int32_t + bias_int32_quant = graph_module.graph.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default, + ( + bias, + bias_scale, + 0, + -(2**31), + (2**31) - 1, + torch.int32, + ), + ) + + # Compute the out multiplier and out shift. They are used when the conv op is + # replaced by quantized linear, we compute them a priori for simplicity but + # may revisit the decision. 
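# (Hypothetical numbers for illustration, not taken from this patch: with an
# input scale of 0.02 and a weight scale of 0.01, bias_scale computed above is
# 0.02 * 0.01 == 2e-4; for an output quantization scale of 0.05, the
# requantization scale computed below is 2e-4 / 0.05 == 4e-3, which
# quantize_tensor_multiplier then splits into an int32 fixed-point
# out_multiplier and a right-shift amount.)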
+ requantize_scale = bias_scale / quant_node.args[1] + requantize_scale_t = torch.tensor([requantize_scale]) + + (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t) + + out_multiplier_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_multiplier[0].item()), + {"dtype": torch.int32}, + ) + out_shift_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_shift[0].item()), + {"dtype": torch.int32}, + ) + + # Create a single element tensor for the weight zero point + weight_zero_point_tensor = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], weight_zero_point), + {"dtype": torch.int32}, + ) + + # Create a single element tensor for the bias scale + bias_scale_tensor = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], bias_scale), + {"dtype": torch.float32}, + ) + + # Make the args and kwargs for the replacement op + args = tuple(inputs_inputs + weights_inputs + other_inputs + [bias_int32_quant]) + kwargs = { + "stride": stride, + "padding": padding, + "dilation": dilation, + "groups": groups, + "input_zero_point": dequants_inputs[0].args[2], + "weight_zero_point": weight_zero_point_tensor, + "bias_scale": bias_scale_tensor, + "out_scale": quant_node.args[1], + "out_zero_point": quant_node.args[2], + "out_multiplier": out_multiplier_, + "out_shift": out_shift_, + "channel_last": False, + } + return args, kwargs + + +def get_args_and_kwargs_relu( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], +): + # Make the args and kwargs for the replacement op + args = tuple(inputs_inputs) + + X_zero_point = graph_module.graph.call_function( + torch.ops.aten.full.default, ([1], dequants_inputs[0].args[2]) + ) + + kwargs = { + "X_zero_point": X_zero_point, + } + return args, kwargs + + +@dataclass +class PartitionAnchors: + """ + All fields except output are lists of (node, args_index) pair, where node is from + the given partition and node.args[args_index] is an input to the partition. Assumes + a single output. + + Quantizer uses inputs, weights and biases for quantization annotation. The others + field contains tensor inputs that aren't quantized, and the literals fields contains + is used for other types of input values as well as handling default parameters. + """ + + inputs: List[Tuple[fx.Node, int]] = field(default_factory=list) + weights: List[Tuple[fx.Node, int]] = field(default_factory=list) + biases: List[Tuple[fx.Node, int]] = field(default_factory=list) + others: List[Tuple[fx.Node, int]] = field(default_factory=list) + literals: List[Tuple[fx.Node, int]] = field(default_factory=list) + output: List[Union[Tuple[fx.Node], Tuple[fx.Node, QuantizationSpec]]] = field( + default_factory=list + ) + + +class QuantizationPattern(ABC): + @abstractmethod + def partition_types(self) -> List[Any]: + """ + List of types to be passed to find_sequential_partitions. + """ + pass + + @abstractmethod + def get_anchors(self, gm, fused_partition) -> Optional[PartitionAnchors]: + pass + + @abstractmethod + def replacement_op(self) -> Callable[..., Any]: + """ + Operator (most likely a custom one) that this partition should be fused into in + the backend. Refer to the QuantFusion pass for examples. 
+ """ + pass + + +class LinearPattern(QuantizationPattern): + def partition_types(self) -> List[Type[torch.nn.Module]]: + return [torch.nn.Linear] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + linear_node = fused_partition[0].nodes[-1] + + # Keep bias empty if not supplied + bias = [] + if len(linear_node.args) > 2: + bias = [(linear_node, 2)] + + return PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + biases=bias, + output=[(linear_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_linear.default + + +class LinearFunctionalPattern(QuantizationPattern): + def partition_types(self): + return [torch.nn.functional.linear] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + linear_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + biases=[(linear_node, 2)], + output=[(linear_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_linear.default + + +class LayerNormPattern(QuantizationPattern): + def partition_types(self): + return [torch.nn.LayerNorm] + + def get_anchors(self, gm, fused_partition) -> PartitionAnchors: + layer_norm_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(layer_norm_node, 0)], + weights=[(layer_norm_node, 2)], + biases=[(layer_norm_node, 3)], + others=[(layer_norm_node, 1)], + output=[(layer_norm_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_layer_norm.default + + +class Conv1dPattern(QuantizationPattern): + def partition_types(self) -> List[Type[torch.nn.Module]]: + return [torch.nn.Conv1d] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + conv1d_node = fused_partition[0].nodes[-1] + + # If bias is None, replace it with an empty list. + bias = ( + [(conv1d_node, 2)] + if len(conv1d_node.args) > 2 and conv1d_node.args[2] + else [] + ) + + return PartitionAnchors( + inputs=[(conv1d_node, 0)], + weights=[(conv1d_node, 1)], + biases=bias, + output=[(conv1d_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_conv.default + + +class Conv2dPattern(QuantizationPattern): + def partition_types(self) -> List[Type[torch.nn.Module]]: + return [torch.nn.Conv2d] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + conv2d_node = fused_partition[0].nodes[-1] + + # If bias is None, replace it with an empty list. 
+ bias = ( + [(conv2d_node, 2)] + if len(conv2d_node.args) > 2 and conv2d_node.args[2] + else [] + ) + + return PartitionAnchors( + inputs=[(conv2d_node, 0)], + weights=[(conv2d_node, 1)], + biases=bias, + output=[(conv2d_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_conv.default + + +class AddmmPattern(QuantizationPattern): + def partition_types(self) -> List[Type[torch.nn.Module]]: + return [torch.addmm] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + addmm_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(addmm_node, 1)], + weights=[(addmm_node, 2)], + biases=[(addmm_node, 0)], + output=[(addmm_node,)], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_linear.default + + +class ReluPattern(QuantizationPattern): + def partition_types(self) -> List[Type[torch.nn.Module]]: + return [torch.nn.ReLU] + + def get_anchors( + self, gm: GraphModule, fused_partition: List[GraphModule] + ) -> PartitionAnchors: + relu_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(relu_node, 0)], + weights=[], + biases=[], + # pyre-fixme[6]: Incompatible parameter type + output=[ + (relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node))) + ], + ) + + def replacement_op(self): + return torch.ops.cadence.quantized_relu.default + + +class GenericQuantizer(Quantizer): + def __init__(self, pattern, quantization_config): + super().__init__() + self.pattern = pattern + self.quantization_config = quantization_config + + def annotate(self, model): + fused_partitions = find_sequential_partitions( + model, + self.pattern.partition_types(), + ) + + input_act_qspec = self.quantization_config.input_activation + weight_qspec = self.quantization_config.weight + bias_qspec = self.quantization_config.bias + output_act_qspec = self.quantization_config.output_activation + + for fused_partition in fused_partitions: + if not _no_outside_users(fused_partition): + continue + + anchors = self.pattern.get_anchors(model, fused_partition) + if not anchors: + continue + if _is_annotated( + [ + x[0] + for x in anchors.inputs + + anchors.weights + + anchors.biases + + anchors.output + ] + ): + continue + + for output, *custom_spec in anchors.output: + output.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=custom_spec[0] if custom_spec else output_act_qspec, + _annotated=True, + ) + + def annotate_inputs(inputs, spec): + for node, idx in inputs: + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) + annotation.input_qspec_map[node.args[idx]] = spec + node.meta["quantization_annotation"] = annotation + + annotate_inputs(anchors.inputs, input_act_qspec) + annotate_inputs(anchors.weights, weight_qspec) + annotate_inputs(anchors.biases, bias_qspec) + + def validate(self, model: fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> List[OperatorConfig]: + return [] + + +act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), +) + +wgt_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=MinMaxObserver, +) + + +class CadenceBaseQuantizer(ComposableQuantizer): + def __init__(self): + static_qconfig = 
QuantizationConfig( + act_qspec, + act_qspec, + wgt_qspec, + None, + ) + static_qconfig_no_wgt = QuantizationConfig( + act_qspec, + act_qspec, + None, + None, + ) + super().__init__( + [ + GenericQuantizer(AddmmPattern(), static_qconfig), + GenericQuantizer(Conv1dPattern(), static_qconfig), + GenericQuantizer(Conv2dPattern(), static_qconfig), + GenericQuantizer(LayerNormPattern(), static_qconfig_no_wgt), + GenericQuantizer(LinearFunctionalPattern(), static_qconfig), + GenericQuantizer(LinearPattern(), static_qconfig), + GenericQuantizer(ReluPattern(), static_qconfig), + ] + ) + + +class QuantFusion(ExportPass): + def __init__(self, patterns): + super().__init__() + self.patterns = patterns + + def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 + for pattern in self.patterns: + fused_partitions = find_sequential_partitions( + graph_module, + pattern.partition_types(), + ) + for fused_partition in fused_partitions: + anchors = pattern.get_anchors(graph_module, fused_partition) + if not anchors: + continue + if any(self.is_fused(p.nodes) for p in fused_partition): + continue + + for p in fused_partition: + self.mark_fused(p.nodes) + + dequants_inputs = [] + for node, idx in anchors.inputs: + if ( + node.args[idx].target + == torch.ops.quantized_decomposed.dequantize_per_tensor.default + ): + dequants_inputs.append(node.args[idx]) + dequants_weights = [] + for node, idx in anchors.weights: + if ( + node.args[idx].target + == torch.ops.quantized_decomposed.dequantize_per_tensor.default + ): + dequants_weights.append(node.args[idx]) + + inputs_inputs = [node.args[0] for node in dequants_inputs] + weights_inputs = [node.args[0] for node in dequants_weights] + weights_init_inputs = [node.args[idx] for node, idx in anchors.weights] + bias_inputs = [node.args[idx] for node, idx in anchors.biases] + other_inputs = [node.args[idx] for node, idx in anchors.others] + + # The node is the first index of the list and first of the tuple + op_node = anchors.output[0][0] + + assert len(op_node.users) == 1 + quant_node = list(op_node.users.keys())[0] + + with graph_module.graph.inserting_after(op_node): + args = tuple( + inputs_inputs + weights_inputs + other_inputs + bias_inputs + ) + kwargs = {} + if isinstance(pattern, Conv1dPattern) or isinstance( + pattern, Conv2dPattern + ): + args, kwargs = get_args_and_kwargs_conv1d( + graph_module, + inputs_inputs, + dequants_inputs, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + quant_node, + op_node, + ) + elif isinstance(pattern, LinearPattern) or isinstance( + pattern, LinearFunctionalPattern + ): + args, kwargs = get_args_and_kwargs_linear( + graph_module, + inputs_inputs, + dequants_inputs, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + quant_node, + ) + elif isinstance(pattern, LayerNormPattern): + args, kwargs = get_args_and_kwargs_layer_norm( + graph_module, + inputs_inputs, + dequants_inputs, + other_inputs, + weights_init_inputs, + bias_inputs, + quant_node, + ) + elif isinstance(pattern, AddmmPattern): + # Transpose the weight tensor + transposed_weights = graph_module.graph.call_function( + torch.ops.aten.transpose.int, + (weights_inputs[0], 0, 1), + ) + # Call linear with transposed weight + args, kwargs = get_args_and_kwargs_linear( + graph_module, + inputs_inputs, + dequants_inputs, + other_inputs, + [transposed_weights], + dequants_weights, + bias_inputs, + quant_node, + ) + elif isinstance(pattern, ReluPattern): + args, kwargs = get_args_and_kwargs_relu( + graph_module, + 
inputs_inputs, + dequants_inputs, + ) + fused = graph_module.graph.call_function( + pattern.replacement_op(), + args, + kwargs, + ) + fused.meta = quant_node.meta + quant_node.replace_all_uses_with(fused) + + legalize_graph(graph_module) + graph_module.graph.eliminate_dead_code() + # pyre-fixme[7]: Incompatible return type + graph_module.recompile() + + @classmethod + def is_fused(cls, nodes) -> bool: + return any(cls.__qualname__ in n.meta for n in nodes) + + @classmethod + def mark_fused(cls, nodes) -> bool: + for n in nodes: + # pyre-fixme[7]: Incompatible return type + n.meta["QuantFusion"] = True + + +class ReplacePT2QuantWithCadenceQuant(ExportPass): + """ + Replace the pt2 quantization ops with custom cadence quantization ops. + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}: + return super().call_operator(op, args, kwargs, meta) + + return super().call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args, + kwargs, + meta, + ) + + +class ReplacePT2DequantWithCadenceDequant(ExportPass): + """ + Replace the pt2 dequantization ops with custom cadence dequantization ops. + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}: + return super().call_operator(op, args, kwargs, meta) + + return super().call_operator( + exir_ops.edge.cadence.dequantize_per_tensor.default, + args, + kwargs, + meta, + ) diff --git a/examples/cadence/aot/utils.py b/examples/cadence/aot/utils.py new file mode 100644 index 00000000000..b511f95e80a --- /dev/null +++ b/examples/cadence/aot/utils.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
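The get_conv1d_output_size helper added below applies the standard output-length formula from the torch.nn.Conv1d documentation. A self-contained sanity check with hypothetical sizes (the concrete numbers are illustrative only):

    # Batch 1, 4 input channels, length 32; kernel 3, stride 2, padding 1, dilation 1.
    in_size = (1, 4, 32)
    out_channels, stride, padding, dilation, kernel_size = 8, 2, 1, 1, 3

    l_out = (in_size[2] + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
    assert l_out == 16  # the meta kernel would report an output of shape (1, 8, 16)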
+ +import logging +import operator +from typing import Dict + +import torch +from executorch.exir import memory +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket +from tabulate import tabulate + + +# Get the output size of a 1D convolution given the input size and parameters +def get_conv1d_output_size( + in_size: torch.Size, + out_channels: int, + stride: int, + padding: int, + dilation: int, + kernel_size: int, +) -> torch.Size: + assert len(in_size) == 3 + N, C, L = in_size + + # Reference: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + lout = (L + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1 + + return torch.Size((in_size[0], out_channels, lout)) + + +# Return the overload packet for the edge op +def get_edge_overload_packet(edge_op: EdgeOpOverload) -> EdgeOpOverloadPacket: + edge_op_namespace, edge_op_name = ( + edge_op.namespace, + edge_op._schema.name.split("::")[1], + ) + edge_op_overload_packet = getattr( + getattr(exir_ops.edge, edge_op_namespace), edge_op_name + ) + return edge_op_overload_packet + + +# Get the frequency list of ops in a graph module +def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]: + freq = {} + # Loop over nodes to count the number of times each op occurs + for node in graph_module.graph.nodes: + if node.op == "call_function": + # Ignore getitem, alloc and view cases, we only want actual operations + if ( + node.target == operator.getitem + or node.target.__name__ == "alloc" + or node.target == memory.view + ): + continue + # If the op is already present, increment the count + if get_edge_overload_packet(node.target).__name__ in freq: + freq[get_edge_overload_packet(node.target).__name__] += 1 + # else, add a new entry + else: + freq[get_edge_overload_packet(node.target).__name__] = 1 + return freq + + +# Print the ops and how many times they occur multiple graph modules: +# from export, from to_edge, and from Jarvis. Print the available +# implementations for each op, and error out if the op is not supported. 
+def print_ops_info( + export_gm: torch.fx.GraphModule, + to_edge_gm: torch.fx.GraphModule, + jarvis_gm: torch.fx.GraphModule, +): + export_ops_count = get_ops_count(export_gm) + to_edge_ops_count = get_ops_count(to_edge_gm) + jarvis_ops_count = get_ops_count(jarvis_gm) + + # De-duplicate the "" and "_copy" ops + keys_to_delete_and_add = [] + for k1 in export_ops_count: + for k2 in {**to_edge_ops_count, **jarvis_ops_count}: + if k2.startswith(k1): + keys_to_delete_and_add.append((k1, k2)) + break + + for k in keys_to_delete_and_add: + export_ops_count[k[1]] = export_ops_count[k[0]] + del export_ops_count[k[0]] + + removed_ops = [] + # Get the counts of the ops that are removed from the final graph + for k in {**export_ops_count, **to_edge_ops_count}: + if k not in jarvis_ops_count: + removed_ops.append(k) + + # Create a dict of ops and their counts to pass to tabulate + ops_count = [ + [ + op, + jarvis_ops_count[op], + to_edge_ops_count[op] if op in to_edge_ops_count else 0, + export_ops_count[op] if op in export_ops_count else 0, + ] + for op in jarvis_ops_count + ] + sorted_ops_count = sorted(ops_count, key=lambda x: x[1], reverse=True) + + # Create a dict of deleted ops and their counts to pass to tabulate + removed_ops_count = [ + [ + op, + 0, + to_edge_ops_count[op] if op in to_edge_ops_count else 0, + export_ops_count[op] if op in export_ops_count else 0, + ] + for op in removed_ops + ] + + # Print the final ops and their counts in a tabular format + logging.info( + tabulate( + sorted_ops_count, + headers=[ + "Final Operators ", # one character longer than the longest op name + "Jarvis (Final) Graph", + "To_edge Graph", + "Export Graph", + ], + tablefmt="outline", + ) + ) + + # Print the removed ops and their counts in a tabular format (if any) + if removed_ops != []: + logging.info( + tabulate( + removed_ops_count, + headers=[ + "Deleted Operators ", # one character longer than the longest op name + "Jarvis (Final) Graph", + "To_edge Graph", + "Export Graph", + ], + tablefmt="outline", + ) + ) diff --git a/examples/xtensa/xtensa.cmake b/examples/cadence/cadence.cmake similarity index 100% rename from examples/xtensa/xtensa.cmake rename to examples/cadence/cadence.cmake diff --git a/examples/xtensa/executor_runner.cpp b/examples/cadence/executor_runner.cpp similarity index 99% rename from examples/xtensa/executor_runner.cpp rename to examples/cadence/executor_runner.cpp index 37ade9914fc..7144f84507b 100644 --- a/examples/xtensa/executor_runner.cpp +++ b/examples/cadence/executor_runner.cpp @@ -12,7 +12,7 @@ * This is a simple executor_runner that boots up the DSP, configures the serial * port, sends a bunch of test messages to the M33 core and then loads the model * defined in model_pte.h. It runs this model using the ops available in - * xtensa/ops directory. + * cadence/ops directory. */ #include diff --git a/examples/xtensa/kernels/CMakeLists.txt b/examples/cadence/kernels/CMakeLists.txt similarity index 78% rename from examples/xtensa/kernels/CMakeLists.txt rename to examples/cadence/kernels/CMakeLists.txt index edd2a606000..963181c0664 100644 --- a/examples/xtensa/kernels/CMakeLists.txt +++ b/examples/cadence/kernels/CMakeLists.txt @@ -5,10 +5,10 @@ # LICENSE file in the root directory of this source tree. 
# lint_cmake: -linelength -add_library(xtensa_kernels kernels.cpp ${EXECUTORCH_ROOT}/examples/xtensa/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp) +add_library(cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/examples/cadence/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp) target_include_directories( - xtensa_kernels + cadence_kernels PUBLIC . ${NN_LIB_BASE_DIR}/xa_nnlib/algo/common/include/ ${NN_LIB_BASE_DIR}/xa_nnlib/include/nnlib diff --git a/examples/xtensa/kernels/kernels.cpp b/examples/cadence/kernels/kernels.cpp similarity index 98% rename from examples/xtensa/kernels/kernels.cpp rename to examples/cadence/kernels/kernels.cpp index 5fcc545a540..2f29b25ac82 100644 --- a/examples/xtensa/kernels/kernels.cpp +++ b/examples/cadence/kernels/kernels.cpp @@ -16,6 +16,11 @@ namespace impl { namespace HiFi { namespace kernels { +__attribute__((always_inline)) void +memcpy(void* dst, const void* src, size_t num_bytes) { + MEMCPY_8b(dst, src, num_bytes); +} + // Quantize a fp32 value to an int8_t/uint8_t value template __attribute__((always_inline)) T diff --git a/examples/xtensa/kernels/kernels.h b/examples/cadence/kernels/kernels.h similarity index 88% rename from examples/xtensa/kernels/kernels.h rename to examples/cadence/kernels/kernels.h index 6a5a255c0ad..13e0470b382 100644 --- a/examples/xtensa/kernels/kernels.h +++ b/examples/cadence/kernels/kernels.h @@ -16,6 +16,8 @@ namespace impl { namespace HiFi { namespace kernels { +void memcpy(void* dst, const void* src, size_t num_bytes); + WORD32 matmul_asym8uxasym8u_asym8u( UWORD8* __restrict__ p_out, // output uint8 matrix const UWORD8* __restrict__ p_mat1, // weight uint8 matrix @@ -35,6 +37,12 @@ WORD32 matmul_asym8uxasym8u_asym8u( WORD32 out_zero_bias, bool per_channel_quantized = false); // per-channel quantized weight +template +T quantize(const float x, float scale, int32_t zero_point); + +template +float dequantize(const T x, float scale, int32_t zero_point); + template void quantize( T* __restrict__ y, diff --git a/examples/xtensa/ops/CMakeLists.txt b/examples/cadence/ops/CMakeLists.txt similarity index 66% rename from examples/xtensa/ops/CMakeLists.txt rename to examples/cadence/ops/CMakeLists.txt index 215de49f20c..a5453979f0e 100644 --- a/examples/xtensa/ops/CMakeLists.txt +++ b/examples/cadence/ops/CMakeLists.txt @@ -26,29 +26,35 @@ endif() # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp") -add_library(aten_ops_xtensa ${_aten_ops__srcs}) -target_link_libraries(aten_ops_xtensa PUBLIC executorch) +add_library(aten_ops_cadence ${_aten_ops__srcs}) +target_link_libraries(aten_ops_cadence PUBLIC executorch) +target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_xtensa PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. + ${CMAKE_BINARY_DIR} + ${_common_include_directories}) # Custom ops that are needed to run the test model. 
add_library( - custom_ops "quantized_linear_out.cpp" "quantize_per_tensor.cpp" - "dequantize_per_tensor.cpp") + custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" + "quantized_relu_out.cpp" "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}) target_link_libraries(custom_ops PUBLIC executorch) -target_link_libraries(custom_ops PRIVATE xtensa_kernels) +target_link_libraries(custom_ops PRIVATE cadence_kernels) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in functions.yaml @@ -58,6 +64,6 @@ generate_bindings_for_kernels( message("Generated files ${gen_command_sources}") gen_operators_lib( - "xtensa_ops_lib" + "cadence_ops_lib" KERNEL_LIBS custom_ops - DEPS aten_ops_xtensa) + DEPS aten_ops_cadence) diff --git a/examples/xtensa/ops/dequantize_per_tensor.cpp b/examples/cadence/ops/dequantize_per_tensor.cpp similarity index 100% rename from examples/xtensa/ops/dequantize_per_tensor.cpp rename to examples/cadence/ops/dequantize_per_tensor.cpp diff --git a/examples/cadence/ops/functions.yaml b/examples/cadence/ops/functions.yaml new file mode 100644 index 00000000000..08db1ca6f25 --- /dev/null +++ b/examples/cadence/ops/functions.yaml @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This yaml file contains operators that are also defined by the ATen library. +# For lean mode: +# - Codegen'd target `executorch_generated_lib` will be reading all the information +# from this file, including operator schema and kernel metadata. +# - Selective build target `codegen:executorch_defined_ops` now is selecting all the +# operators in this file, by dumping all the op names into `selected_operators.yaml`. +# +# See the README.md file in executorch/kernels/portable for a description of the syntax used +# by this file. + + +# aten ops +- op: add.out + kernels: + - arg_meta: null + kernel_name: torch::executor::add_out + +- op: embedding.out + kernels: + - arg_meta: null + kernel_name: torch::executor::embedding_out + +- op: full.out + kernels: + - arg_meta: null + kernel_name: torch::executor::full_out + +- op: permute_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::permute_copy_out + +- op: view_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::view_copy_out + +# custom ops +- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_out + +- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::dequantize_per_tensor_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv_out + +- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_layer_norm_out + +- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_linear_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_relu_out diff --git a/examples/xtensa/ops/op_add.cpp b/examples/cadence/ops/op_add.cpp similarity index 100% rename from examples/xtensa/ops/op_add.cpp rename to examples/cadence/ops/op_add.cpp diff --git a/examples/cadence/ops/op_embedding.cpp b/examples/cadence/ops/op_embedding.cpp new file mode 100644 index 00000000000..b4100feacc1 --- /dev/null +++ b/examples/cadence/ops/op_embedding.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +void embedding_out( + RuntimeContext& ctx, + const Tensor& weight, + const Tensor& indices, + int64_t padding_idx, + bool scale_grad_by_freq, + bool sparse, + Tensor& out) { + int64_t nbytes_per_entry = weight.size(1) * weight.element_size(); + const char* w_data = weight.const_data_ptr(); + char* out_data = out.mutable_data_ptr(); + const int64_t* indices_ptr = indices.const_data_ptr(); + + for (int i = 0, e = indices.numel(); i < e; i++) { + // memcpy(dest, src, nbytes); + impl::HiFi::kernels::memcpy( + out_data, w_data + nbytes_per_entry * indices_ptr[i], nbytes_per_entry); + out_data += nbytes_per_entry; + } +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/examples/xtensa/ops/op_full.cpp b/examples/cadence/ops/op_full.cpp similarity index 100% rename from examples/xtensa/ops/op_full.cpp rename to examples/cadence/ops/op_full.cpp diff --git a/examples/cadence/ops/op_view_copy.cpp b/examples/cadence/ops/op_view_copy.cpp new file mode 100644 index 00000000000..e856c1592cb --- /dev/null +++ b/examples/cadence/ops/op_view_copy.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +Tensor& view_copy_out( + RuntimeContext& ctx, + const Tensor& input, + const IntArrayRef size, + Tensor& out) { + impl::HiFi::kernels::memcpy( + out.mutable_data_ptr(), input.const_data_ptr(), input.nbytes()); + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/examples/xtensa/ops/quantize_per_tensor.cpp b/examples/cadence/ops/quantize_per_tensor.cpp similarity index 100% rename from examples/xtensa/ops/quantize_per_tensor.cpp rename to examples/cadence/ops/quantize_per_tensor.cpp diff --git a/examples/cadence/ops/quantized_conv_out.cpp b/examples/cadence/ops/quantized_conv_out.cpp new file mode 100644 index 00000000000..23e189e6bcb --- /dev/null +++ b/examples/cadence/ops/quantized_conv_out.cpp @@ -0,0 +1,227 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "kernels.h" + +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. +// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + const int32_t* __restrict__ weight_zero_point = nullptr, + const float* __restrict__ bias_scale = nullptr, + float out_scale = 1, + OT out_zero_point = 0, + bool per_tensor_quantized = true) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. 
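+        // (Illustrative note: oh and ow are taken from the output tensor and
+        // trusted by the loops below; with the usual convolution output-shape
+        // formula they correspond to
+        //   oh = (h + 2*p0 - d0*(wh - 1) - 1) / s0 + 1
+        //   ow = (w + 2*p1 - d1*(ww - 1) - 1) / s1 + 1  (integer division).)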
The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = + (per_tensor_quantized ? bias_scale[0] : bias_scale[_oc]) * + acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. +void quantized_conv_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + exec_aten::IntArrayRef stride, + exec_aten::IntArrayRef padding, + exec_aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + bool channel_last, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? 
out.size(2) : out.size(3); + + // Bool flag to check if weight tensor is quantized per-tensor or + // per-channel + bool per_tensor_quantized = bias_scale.numel() == 1; + + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + dilation[0], + dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (uint8_t)output_zero_point, + per_tensor_quantized); +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/examples/cadence/ops/quantized_layer_norm.cpp b/examples/cadence/ops/quantized_layer_norm.cpp new file mode 100644 index 00000000000..27d86e56227 --- /dev/null +++ b/examples/cadence/ops/quantized_layer_norm.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +#include +#include +#include + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +namespace impl { +namespace HiFi { +namespace native { + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_( + const Tensor& input, + float input_scale, + int64_t input_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Get the raw pointers to input, output, weight, and bias + const T* __restrict__ in_data = input.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + const float* __restrict__ weight_data = weight.const_data_ptr(); + const float* __restrict__ bias_data = bias.const_data_ptr(); + + float output_inv_scale = XT_RECIP_S(output_scale); + + size_t last_dim = input.size(input.dim() - 1); + size_t leading_dims = getLeadingDims(input, input.dim() - 1); + + // Visualize the input tensor as a set of 1d vectors, and compute the + // layer_norm for each vector. + for (size_t i = 0; i < leading_dims; ++i) { + const T* __restrict__ x = in_data + i * last_dim; + T* __restrict__ y = out_data + i * last_dim; + + // compute sum and squared sum. The fp32 sum can be approximated as: + // (X_1 - in_zero_point) * in_scale + (X_2 - in_zero_point) * in_scale + ... + // (X_N - in_zero_point) * in_scale. + int32_t sum = 0; + int32_t sq_sum = last_dim * input_zero_point * input_zero_point; +#pragma simd + for (size_t j = 0; j < last_dim; ++j) { + int32_t val = x[j]; + sum += val; + sq_sum += val * val; + } + sq_sum -= (2 * sum * input_zero_point); + sum -= (last_dim * input_zero_point); + + float mean = XT_DIV_S(XT_MUL_S(input_scale, sum), last_dim); + float variance = + XT_DIV_S( + XT_MUL_S(sq_sum, XT_MUL_S(input_scale, input_scale)), last_dim) - + XT_MUL_S(mean, mean); + float inv_std = XT_RECIP_S(XT_SQRT_S(XT_ADD_S(variance, (float)eps))); + + // y = (x - mean) / std * kGamma + kBeta +#pragma simd + for (size_t j = 0; j < last_dim; ++j) { + // Since X is quantized, we dequantize it, compute fp32 result, and + // quantize the result to an int8/uint8 value. 
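+      // (Illustrative, assuming kernels::quantize/dequantize implement the
+      // standard affine scheme:
+      //   dequantize(q, scale, zp)   ~ (q - zp) * scale
+      //   quantize(v, inv_scale, zp) ~ round(v * inv_scale) + zp, clamped to
+      // the output dtype's range; output_inv_scale above is 1 / output_scale.)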
+ float val = kernels::dequantize(x[j], input_scale, input_zero_point); + val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; + y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + } + } +} + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_( + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Extract the zero point and scale for input tensor. + float input_scale = in_scale.const_data_ptr()[0]; + int64_t input_zero_point = in_zero_point.const_data_ptr()[0]; + + // Call other overload + quantized_layer_norm_( + input, + input_scale, + input_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); +} + +void quantized_layer_norm_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + const exec_aten::IntArrayRef normalized_shape, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + if (input.scalar_type() == exec_aten::ScalarType::Byte) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == exec_aten::ScalarType::Char) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type()); + } +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/examples/xtensa/ops/quantized_linear_out.cpp b/examples/cadence/ops/quantized_linear_out.cpp similarity index 81% rename from examples/xtensa/ops/quantized_linear_out.cpp rename to examples/cadence/ops/quantized_linear_out.cpp index 2acf36e33d8..2fdd900008d 100644 --- a/examples/xtensa/ops/quantized_linear_out.cpp +++ b/examples/cadence/ops/quantized_linear_out.cpp @@ -19,35 +19,23 @@ namespace native { using Tensor = exec_aten::Tensor; using RuntimeContext = torch::executor::RuntimeContext; -namespace linear_util { -// This function compute the product of dim[0:dim] where dim is not inclusive -size_t getLeadingDims(const Tensor& tensor, int64_t dim) { - size_t dims = 1; - for (size_t i = 0; i < dim; ++i) { - dims *= tensor.size(i); - } - return dims; -} -} // namespace linear_util - -void quantized_linear_pt2_out( +void quantized_linear_out( RuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, - double src_scale, int64_t src_zero_point, - double weight_scale, - int64_t weight_zero_point, + const Tensor& weight_zero_point, const Tensor& out_multiplier, const Tensor& out_shift, int64_t out_zero_point, + const exec_aten::optional& offset, Tensor& out) { // input comes in shape [leading_dims, in_dim] // weight comes in shape [out_dim, in_dim] // output comes in empty with shape [leading_dims, out_dim] // Perform matrix multiply (M x N) x (N x P)' => M x P - int64_t leading_dims = linear_util::getLeadingDims(src, src.dim() - 1); + int64_t leading_dims = getLeadingDims(src, src.dim() - 1); int64_t out_dim = weight.size(0); // = out_dim int64_t in_dim = weight.size(1); // = in_dim @@ -69,7 +57,7 @@ void quantized_linear_pt2_out( in_dim, // vec_offset of p_mat2. 
out_dim, // out_offset, i.e., offset of next output element written 1, // out_stride, i.e., stride to go to next output row - -weight_zero_point, // mat1_zero_bias + -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -src_zero_point, // mat2_zero_bias out_multiplier.const_data_ptr(), // out_multiplier out_shift.const_data_ptr(), // out_shift diff --git a/examples/cadence/ops/quantized_relu_out.cpp b/examples/cadence/ops/quantized_relu_out.cpp new file mode 100644 index 00000000000..1643747baec --- /dev/null +++ b/examples/cadence/ops/quantized_relu_out.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +// Note: this kernel assumes that the input and output share quantization +// parameters. If that is not the case, it will produce incorrect results. +template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + out[i] = in[i] > q_zero_point ? in[i] : q_zero_point; + } +} + +void quantized_relu_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + Tensor& output) { + if (input.scalar_type() == exec_aten::ScalarType::Byte) { + quantized_relu_(input, in_zero_point, output); + } else if (input.scalar_type() == exec_aten::ScalarType::Char) { + quantized_relu_(input, in_zero_point, output); + } else { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type()); + } +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/examples/cadence/tests/quantized_conv1d_example.py b/examples/cadence/tests/quantized_conv1d_example.py new file mode 100644 index 00000000000..47ec4e9b661 --- /dev/null +++ b/examples/cadence/tests/quantized_conv1d_example.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Example script for exporting simple models to flatbuffer + +import logging + +from ..aot.meta_registrations import * # noqa + +import torch + +from ..aot.export_example import export_model + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +if __name__ == "__main__": + ( + shape, + in_channels, + out_channels, + kernel, + stride, + padding, + dilation, + depthwise, + bias, + channel_last, + ) = [(1, 8, 33), 8, 16, 3, 2, 4, 3, False, True, False] + + class QuantizedConv(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1d = torch.nn.Conv1d( + in_channels, + out_channels, + kernel, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels if depthwise else 1, + bias=bias, + ) + + def forward(self, x: torch.Tensor): + return self.conv1d(x) + + model = QuantizedConv() + model.eval() + + example_inputs = (torch.randn(shape),) + + export_model(model, example_inputs) diff --git a/examples/cadence/tests/quantized_linear_example.py b/examples/cadence/tests/quantized_linear_example.py new file mode 100644 index 00000000000..916b684173a --- /dev/null +++ b/examples/cadence/tests/quantized_linear_example.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example script for exporting simple models to flatbuffer + +import logging + +from ..aot.meta_registrations import * # noqa + +import torch + +from ..aot.export_example import export_model + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +if __name__ == "__main__": + in_features = 32 + out_features = 16 + bias = True + shape = [64, in_features] + + class QuantizedLinear(torch.nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool): + super().__init__() + self.output_linear = torch.nn.Linear(in_features, out_features, bias=bias) + + def forward(self, x: torch.Tensor): + output_linear_out = self.output_linear(x) + return output_linear_out + + model = QuantizedLinear(in_features, out_features, bias) + model.eval() + + example_inputs = (torch.ones(shape),) + + export_model(model, example_inputs) diff --git a/examples/cadence/tests/rnnt_predictor_quantized_example.py b/examples/cadence/tests/rnnt_predictor_quantized_example.py new file mode 100644 index 00000000000..fd94f48f88b --- /dev/null +++ b/examples/cadence/tests/rnnt_predictor_quantized_example.py @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Example script for exporting simple models to flatbuffer + +import logging + +import torch + +from ..aot.meta_registrations import * # noqa + +from typing import Tuple + +from ..aot.export_example import export_model + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +if __name__ == "__main__": + + class Predictor(torch.nn.Module): + def __init__( + self, + num_symbols: int, + symbol_embedding_dim: int, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.relu = torch.nn.ReLU() + self.linear = torch.nn.Linear(symbol_embedding_dim, symbol_embedding_dim) + self.layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + relu_out = self.relu(embedding_out) + linear_out = self.linear(relu_out) + layer_norm_out = self.layer_norm(linear_out) + return layer_norm_out.permute(1, 0, 2), lengths + + # Predictor + model = Predictor(128, 256) + model.eval() + + # Batch size + batch_size = 1 + + num_symbols = 128 + max_target_length = 10 + + # Dummy inputs + predictor_input = torch.randint(0, num_symbols, (batch_size, max_target_length)) + predictor_lengths = torch.randint(1, max_target_length + 1, (batch_size,)) + + example_inputs = ( + predictor_input, + predictor_lengths, + ) + + export_model(model, example_inputs) diff --git a/examples/xtensa/third-party/nnlib-hifi4/license.txt b/examples/cadence/third-party/nnlib-hifi4/license.txt similarity index 100% rename from examples/xtensa/third-party/nnlib-hifi4/license.txt rename to examples/cadence/third-party/nnlib-hifi4/license.txt diff --git a/examples/xtensa/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp b/examples/cadence/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp similarity index 100% rename from examples/xtensa/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp rename to examples/cadence/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u.cpp diff --git a/examples/xtensa/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u_macros.h b/examples/cadence/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u_macros.h similarity index 100% rename from examples/xtensa/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u_macros.h rename to examples/cadence/third-party/nnlib-hifi4/matmul_asym8uxasym8u_asym8u_macros.h diff --git a/examples/xtensa/third-party/nnlib-hifi4/xa_nnlib_matmul_unroll_macros.h b/examples/cadence/third-party/nnlib-hifi4/xa_nnlib_matmul_unroll_macros.h similarity index 100% rename from examples/xtensa/third-party/nnlib-hifi4/xa_nnlib_matmul_unroll_macros.h rename to examples/cadence/third-party/nnlib-hifi4/xa_nnlib_matmul_unroll_macros.h diff --git a/examples/xtensa/utils/gen_header.py b/examples/cadence/utils/gen_header.py similarity index 100% rename from examples/xtensa/utils/gen_header.py rename to examples/cadence/utils/gen_header.py diff --git a/examples/xtensa/utils/post_compilation.py b/examples/cadence/utils/post_compilation.py similarity index 100% rename from examples/xtensa/utils/post_compilation.py rename to examples/cadence/utils/post_compilation.py diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 990dcfadc53..27e4d14876a 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ 
b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -17,7 +17,7 @@ This guide explains how to setup ExecuTorch for Android using a demo app. The ap * Refer to [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. * Download and install [Android Studio and SDK](https://developer.android.com/studio). * Supported Host OS: CentOS, macOS Ventura (M1/x86_64). See below for Qualcomm HTP specific requirements. -* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. +* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. ::: :::: @@ -39,7 +39,7 @@ We generate the model file for the ExecuTorch runtime in Android Demo App. For delegating DeepLab v3 to XNNPACK backend, please do the following to export the model: ```bash -export FLATC_EXECUTABLE=$(realpath third-party/flatbuffers/cmake-out/flatc) +export FLATC_EXECUTABLE=$(realpath third-party/flatbuffers/cmake-android-out/flatc) python3 -m examples.xnnpack.aot_compiler --model_name="dl3" --delegate mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ cp dl3_xnnpack_fp32.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ @@ -54,7 +54,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8550 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -68,22 +68,20 @@ We build the required ExecuTorch runtime library to run the model. ```bash export ANDROID_NDK= -export BUCK2=/tmp/buck2 # Or your buck path +export ANDROID_ABI=arm64-v8a -rm -rf cmake-out && mkdir cmake-out && cd cmake-out +rm -rf cmake-android-out && mkdir cmake-android-out # Build the core executorch library -cmake .. -DCMAKE_INSTALL_PREFIX=cmake-out \ +cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out -cmake --build . -j16 --target install +cmake --build cmake-android-out -j16 --target install ``` When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_backend`](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt) which in turn is linked into libexecutorch_jni via [CMake](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/jni/CMakeLists.txt). 
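A quick sanity check can catch cross-compilation mistakes early. The snippet below is a sketch, not part of the official flow: the library name `libexecutorch.a` and the `cmake-android-out/lib/` install location are assumed from the commands above, and `llvm-readelf` is the ELF inspector shipped with the Android NDK toolchain.

```bash
# Confirm the installed core library was built for the selected Android ABI (sketch).
ls cmake-android-out/lib/libexecutorch.a
"${ANDROID_NDK}"/toolchains/llvm/prebuilt/*/bin/llvm-readelf -h \
    cmake-android-out/lib/libexecutorch.a | grep -m1 'Machine:'   # expect AArch64 for arm64-v8a
```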
@@ -93,45 +91,53 @@ When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_ba ```bash # Build the android extension -cmake ../extension/android -DBUCK2="${BUCK2}" \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bextension/android + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android -cmake --build ./extension/android -j16 +cmake --build cmake-android-out/extension/android -j16 ``` `libexecutorch_jni.so` wraps up the required XNNPACK Backend runtime library from `xnnpack_backend`, and adds an additional JNI layer using fbjni. This is later exposed to Java app. #### Qualcomm Hexagon NPU -1. Configure the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): +1. Build the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): ```bash export ANDROID_NDK= -export QNN_SDK= - -rm -rf cmake-out && mkdir cmake-out && cd cmake-out -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DBUCK2=/tmp/buck2 \ - -DEXECUTORCH_BUILD_ANDROID_JNI=ON \ +export ANDROID_ABI=arm64-v8a +export QNN_SDK_ROOT= + +rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out +cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI="${ANDROID_ABI}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=$QNN_SDK \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out + +cmake --build cmake-android-out -j16 --target install ``` Similar to the XNNPACK library, with this setup, we compile `libexecutorch_jni.so` but it adds an additional static library `qnn_executorch_backend` which wraps up Qualcomm HTP runtime library and registers the Qualcomm HTP backend. This is later exposed to Java app. `qnn_executorch_backend` is built when we turn on CMake option `EXECUTORCH_BUILD_QNN`. It will include the [CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/CMakeLists.txt) from backends/qualcomm where we `add_library(qnn_executorch_backend STATIC)`. -2. Build the libraries: +2. Build the Android extension: ```bash -cmake --build . -j16 +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI="${ANDROID_ABI}" \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android + +cmake --build cmake-android-out/extension/android -j16 ``` ## Deploying on Device via Demo App @@ -139,14 +145,9 @@ cmake --build . 
-j16 ### Steps for Deploying Model via XNNPACK ```bash -mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a -``` - -Copy the core libraries: - -```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so ``` This allows the Android app to load ExecuTorch runtime with XNNPACK backend as a JNI library. Later, this shared library will be loaded by `NativePeer.java` in Java code. @@ -160,15 +161,17 @@ mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64 We need to push some additional Qualcomm HTP backend libraries to the app. Please refer to [Qualcomm docs](build-run-qualcomm-ai-engine-direct-backend.md) here. ```bash -cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpStub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a ``` Copy the core libraries: ```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/lib/libqnn_executorch_backend.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libqnn_executorch_backend.so ``` ## Running the App diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts index 4407fbc3fe6..615fee860f8 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts @@ -68,3 +68,12 @@ dependencies { debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } + +tasks.register("setup") { + doFirst { + exec { + commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh") + workingDir("../../../../../") + } + } +} diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 66be7da3157..8ff65bee59b 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -1,40 +1,40 @@ #!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# Copyright 2023-2024 Arm Limited and/or its affiliates. -# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. set -eu -# Note: Set up ANDROID_NDK, ANDROID_ABI, BUCK2, and FLATC -cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-out \ +CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" +# Note: Set up ANDROID_NDK and ANDROID_ABI +cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -Bcmake-out + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) else CMAKE_JOBS=$(( $(nproc) - 1 )) fi -cmake --build cmake-out -j "${CMAKE_JOBS}" --target install +cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release -cmake extension/android -DBUCK2="${BUCK2}" \ +cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bcmake-out/extension/android + -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}"/extension/android -cmake --build cmake-out/extension/android -j "${CMAKE_JOBS}" +cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release JNI_LIBS_PATH="examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs" mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" -cp cmake-out/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch.so" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index fccc4288f53..2b3e842bdf6 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -13,9 +13,7 @@ This app demonstrates the use of the LLaMA chat app demonstrating local inferenc * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. * Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. -```{note} -This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. -``` +Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. ## Getting models Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. @@ -27,23 +25,28 @@ adb push llama2.pte /data/local/tmp/llama adb push tokenizer.bin /data/local/tmp/llama ``` -```{note} -The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. -``` +Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. ## Build JNI library 1. Open a terminal window and navigate to the root directory of the `executorch`. 2. Set the following environment variables: -```{note} - is the root for the NDK, which is usually under -~/Library/Android/sdk/ndk/XX.Y.ZZZZZ for macOS, and contains NOTICE and README.md. -We use /build/cmake/android.toolchain.cmake for CMake to cross-compile. -``` ```bash export ANDROID_NDK= export ANDROID_ABI=arm64-v8a ``` -3. 
Run the following command set up the required JNI library: +Note: `` is the root for the NDK, which is usually under +`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. +We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. + +3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set +`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer. +If you need to run other models like LLaMA2, skip this skip. + +```bash +export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3 +``` + +4. Run the following command set up the required JNI library: ```bash pushd examples/demo-apps/android/LlamaDemo ./gradlew :app:setup diff --git a/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml new file mode 100644 index 00000000000..4df9f18cc5f --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml @@ -0,0 +1,76 @@ +version: 0.1 + +android_test_host: amazon_linux_2 + +phases: + install: + commands: + + pre_test: + commands: + # Prepare the model and the tokenizer + - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/llama/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/tokenizer.bin /data/local/tmp/llama/tokenizer.bin" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/xnnpack_llama2.pte /data/local/tmp/llama/xnnpack_llama2.pte" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/tokenizer.bin" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/xnnpack_llama2.pte" + - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/llama/" + + test: + commands: + # By default, the following ADB command is used by Device Farm to run your Instrumentation test. + # Please refer to Android's documentation for more options on running instrumentation tests with adb: + # https://developer.android.com/studio/test/command-line#run-tests-with-adb + - echo "Starting the Instrumentation test" + - | + adb -s $DEVICEFARM_DEVICE_UDID shell "am instrument -r -w --no-window-animation \ + $DEVICEFARM_TEST_PACKAGE_NAME/$DEVICEFARM_TEST_PACKAGE_RUNNER 2>&1 || echo \": -1\"" | + tee $DEVICEFARM_LOG_DIR/instrument.log + + # Parse the results + - |- + INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log" + + DID_ANY_TESTS_START=$(grep "INSTRUMENTATION_STATUS_CODE: 1" $INSTRUMENT_LOG | wc -l); + TESTS_PASSED=$(grep "INSTRUMENTATION_STATUS_CODE: 0" $INSTRUMENT_LOG | wc -l); + TESTS_ERRORED=$(grep "INSTRUMENTATION_STATUS_CODE: -1" $INSTRUMENT_LOG | wc -l); + TESTS_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -2" $INSTRUMENT_LOG | wc -l); + TESTS_IGNORED=$(grep "INSTRUMENTATION_STATUS_CODE: -3" $INSTRUMENT_LOG | wc -l); + TESTS_ASSUMPTION_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -4" $INSTRUMENT_LOG | wc -l); + TESTS_PROCESSES_CRASHED=$(grep "INSTRUMENTATION_RESULT: shortMsg=Process crashed." 
$INSTRUMENT_LOG | wc -l); + + # And print the results so that the CI job can show them later + - | + INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log" + + if [ $DID_ANY_TESTS_START -eq 0 ]; + then + echo "[PyTorch] Marking the test suite as failed because no tests started!"; + false; + elif [ $TESTS_FAILED -ne 0 ]; + then + OBSERVED_TPS=$(grep "The observed TPS " $INSTRUMENT_LOG | tail -n 1) + + if [ -n "${OBSERVED_TPS}" ]; + then + echo "[PyTorch] ${OBSERVED_TPS}"; + else + echo "[PyTorch] Marking the test suite as failed because it failed to load the model"; + fi + elif [ $TESTS_ERRORED -ne 0 ]; + then + echo "[PyTorch] Marking the test suite as failed because $TESTS_ERRORED tests errored!"; + false; + elif [ $TESTS_PROCESSES_CRASHED -ne 0 ]; + then + echo "[PyTorch] Marking the test suite as failed because the app crashed due to OOM!"; + false; + fi; + + post_test: + commands: + +artifacts: + # By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory. + - $DEVICEFARM_LOG_DIR diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java new file mode 100644 index 00000000000..b8988d1f4ba --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +import androidx.test.ext.junit.runners.AndroidJUnit4; +import java.util.ArrayList; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.pytorch.executorch.LlamaCallback; +import org.pytorch.executorch.LlamaModule; + +@RunWith(AndroidJUnit4.class) +public class PerfTest implements LlamaCallback { + + private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; + private static final String MODEL_NAME = "xnnpack_llama2.pte"; + private static final String TOKENIZER_BIN = "tokenizer.bin"; + + // From https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md + private static final Float EXPECTED_TPS = 10.0F; + + private final List results = new ArrayList<>(); + private final List tokensPerSecond = new ArrayList<>(); + + @Test + public void testTokensPerSecond() { + String modelPath = RESOURCE_PATH + MODEL_NAME; + String tokenizerPath = RESOURCE_PATH + TOKENIZER_BIN; + LlamaModule mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + + int loadResult = mModule.load(); + // Check that the model can be load successfully + assertEquals(0, loadResult); + + // Run a testing prompt + mModule.generate("How do you do! 
I'm testing llama2 on mobile device", PerfTest.this); + assertFalse(tokensPerSecond.isEmpty()); + + final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); + assertTrue( + "The observed TPS " + tps + " is less than the expected TPS " + EXPECTED_TPS, + tps >= EXPECTED_TPS); + } + + @Override + public void onResult(String result) { + results.add(result); + } + + @Override + public void onStats(float tps) { + tokensPerSecond.add(tps); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f2529eb6b86..d93c798fcf9 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -79,9 +79,9 @@ private void setLocalModel(String modelPath, String tokenizerPath) { if (loadResult != 0) { AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Load failed: " + loadResult); - AlertDialog alert = builder.create(); runOnUiThread( () -> { + AlertDialog alert = builder.create(); alert.show(); }); } @@ -119,6 +119,10 @@ private String memoryInfo() { private void modelDialog() { String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] modelFiles = listLocalFile("/data/local/tmp/llama/", ".model"); + String[] tokenizerFiles = new String[binFiles.length + modelFiles.length]; + System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); + System.arraycopy(modelFiles, 0, tokenizerFiles, binFiles.length, modelFiles.length); AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); modelPathBuilder.setTitle("Select model path"); AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); @@ -134,10 +138,10 @@ private void modelDialog() { }); tokenizerPathBuilder.setSingleChoiceItems( - binFiles, + tokenizerFiles, -1, (dialog, item) -> { - mTokenizerFilePath = binFiles[item]; + mTokenizerFilePath = tokenizerFiles[item]; Runnable runnable = new Runnable() { @Override diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 8bdba698645..212e214d377 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -8,6 +8,7 @@ set -eu CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" +EXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN:-OFF}" # Note: Set up ANDROID_NDK and ANDROID_ABI cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ @@ -16,6 +17,8 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -30,6 +33,10 @@ cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -40,6 +47,7 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ + -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index ce20a78e8e4..cce00878672 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -13,7 +13,7 @@ 032C01B72AC329B6002955E1 /* CustomViews.swift in Sources */ = {isa = PBXBuildFile; fileRef = 032C01B62AC329B6002955E1 /* CustomViews.swift */; }; 032C01B92AC32ADF002955E1 /* CameraController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 032C01B82AC32ADF002955E1 /* CameraController.swift */; }; 032C01E82AC34B60002955E1 /* MobileNetClassifier.mm in Sources */ = {isa = PBXBuildFile; fileRef = 032C01902AC22B16002955E1 /* MobileNetClassifier.mm */; }; - 032C01EC2AC34CAC002955E1 /* libMobileNetClassifier.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 032C01CB2AC34632002955E1 /* libMobileNetClassifier.a */; }; + 032C01EC2AC34CAC002955E1 /* libMobileNetClassifier.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 032C01CB2AC34632002955E1 /* libMobileNetClassifier.a */; platformFilter = ios; }; 032C02032AC47CFB002955E1 /* mv3_xnnpack_fp32.pte in Resources */ = {isa = PBXBuildFile; fileRef = 032C01FC2AC47CFB002955E1 /* mv3_xnnpack_fp32.pte */; }; 032C02082AC47CFB002955E1 /* imagenet_classes.txt in Resources */ = {isa = PBXBuildFile; fileRef = 032C02012AC47CFB002955E1 /* imagenet_classes.txt */; }; 036834D52ACB710D00BA100F /* mv3.pte in Resources */ = {isa = PBXBuildFile; fileRef = 036834D42ACB710D00BA100F /* mv3.pte */; }; @@ -480,6 +480,7 @@ /* Begin PBXTargetDependency section */ 032C01EB2AC34CA8002955E1 /* PBXTargetDependency */ = { isa = PBXTargetDependency; + platformFilter = ios; target = 032C01CA2AC34632002955E1 /* MobileNetClassifier */; targetProxy = 032C01EA2AC34CA8002955E1 /* PBXContainerItemProxy */; }; @@ -635,9 +636,12 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -660,9 +664,12 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; 
SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -784,7 +791,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = latest; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/README.md b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md similarity index 69% rename from examples/demo-apps/apple_ios/README.md rename to examples/demo-apps/apple_ios/ExecuTorchDemo/README.md index 8c429af74a9..2f9102e7c00 100644 --- a/examples/demo-apps/apple_ios/README.md +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md @@ -40,36 +40,45 @@ pip --version ### 3. Getting Started Tutorial -Before proceeding, follow the [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) -tutorial to configure the basic environment. Feel free to skip building anything -just yet. Make sure you have all the required dependencies installed, including -the following tools: +Follow the [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) +tutorial to configure the basic environment: -- Buck2 (as `/tmp/buck2`) -- Cmake (`cmake` reachable at `$PATH`) -- FlatBuffers Compiler (`flatc` reachable at `$PATH` or as `$FLATC_EXECUTABLE` - enironment variable) +```bash +git clone -b release/0.2 https://github.com/pytorch/executorch.git +cd executorch +git submodule update --init + +python3 -m venv .venv && source .venv/bin/activate + +./install_requirements.sh --pybind coreml mps xnnpack +``` ### 4. Backend Dependencies -Also, follow the corresponding sections from [Core ML](build-run-coreml.md) and -[MPS](build-run-mps.md) tutorials to install additional dependencies for those -backends. Feel free to skip building anything just yet. +Also, follow the corresponding sections from [Core ML](https://pytorch.org/executorch/stable/build-run-coreml) and +[MPS](https://pytorch.org/executorch/stable/build-run-mps) tutorials to install additional dependencies for those +backends: + +```bash +./backends/apple/coreml/scripts/install_requirements.sh + +./backends/apple/mps/install_requirements.sh +``` ## Models and Labels -Now let's move on to exporting and bundling the MobileNet v3 model. +Now, let's move on to exporting and bundling the MobileNet v3 model. ### 1. 
Export Model -Export the MobileNet v3 model with Core ML, MPS and XNNPACK delegates, and move +Export the MobileNet v3 model with Core ML, MPS and XNNPACK backends, and move the exported model to a specific location where the Demo App will pick them up: ```bash python3 -m examples.portable.scripts.export --model_name="mv3" -python3 -m examples.xnnpack.aot_compiler --delegate --model_name="mv3" python3 -m examples.apple.coreml.scripts.export --model_name="mv3" python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" +python3 -m examples.xnnpack.aot_compiler --delegate --model_name="mv3" mkdir -p examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/ mv mv3*.pte examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/ @@ -84,27 +93,6 @@ curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ -o examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Resources/Models/MobileNet/imagenet_classes.txt ``` -## Build Runtime and Backends - -Next, we will build the necessary -[frameworks](https://developer.apple.com/documentation/xcode/creating-a-multi-platform-binary-framework-bundle) -for ExecuTorch and move them over for app linking. - -### 1. Build Frameworks - -```bash -./build/build_apple_frameworks.sh --Release --coreml --mps --xnnpack -``` - -### 2. Move Frameworks for App Linking - -Make sure to have all the `.xcframework` bundles generated at the previous step -at a specific location where the Demo App will pick them up: - -```bash -mv cmake-out examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Frameworks -``` - ## Final Steps We're almost done! Now, we just need to open the project in Xcode, run the diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 80ab3c34b0d..d7cc2c66051 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -41,8 +41,75 @@ 03729F0C2BB203B300152F2E /* util.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F092BB203B300152F2E /* util.h */; }; 03729F122BB2042B00152F2E /* sampler.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F102BB2042B00152F2E /* sampler.h */; }; 03729F132BB2042B00152F2E /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F112BB2042B00152F2E /* sampler.cpp */; }; - 03729F162BB2043600152F2E /* tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* tokenizer.cpp */; }; + 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */; }; 03729F172BB2043600152F2E /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F152BB2043600152F2E /* tokenizer.h */; }; + 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */; }; + 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */; }; + 03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */; }; + 03DDA09E2BD6263A00D234B3 /* mutex.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA09D2BD6263A00D234B3 /* mutex.cc */; }; + 03DDA0A02BD6266000D234B3 /* graphcycles.cc in Sources */ = {isa = PBXBuildFile; fileRef = 
03DDA09F2BD6266000D234B3 /* graphcycles.cc */; }; + 03DDA0A22BD6272700D234B3 /* stacktrace.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0A12BD6272700D234B3 /* stacktrace.cc */; }; + 03DDA0A42BD6273D00D234B3 /* cycleclock.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0A32BD6273D00D234B3 /* cycleclock.cc */; }; + 03DDA0A62BD6275000D234B3 /* spinlock_wait.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0A52BD6275000D234B3 /* spinlock_wait.cc */; }; + 03DDA0A82BD6275F00D234B3 /* low_level_alloc.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0A72BD6275F00D234B3 /* low_level_alloc.cc */; }; + 03DDA0AB2BD627C000D234B3 /* thread_identity.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0A92BD627C000D234B3 /* thread_identity.cc */; }; + 03DDA0AC2BD627C000D234B3 /* sysinfo.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0AA2BD627C000D234B3 /* sysinfo.cc */; }; + 03DDA0AE2BD627D800D234B3 /* spinlock.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0AD2BD627D800D234B3 /* spinlock.cc */; }; + 03DDA0B02BD6282500D234B3 /* hash.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0AF2BD6282500D234B3 /* hash.cc */; }; + 03DDA0B22BD628DC00D234B3 /* create_thread_identity.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0B12BD628DC00D234B3 /* create_thread_identity.cc */; }; + 03DDA0B42BD6299B00D234B3 /* per_thread_sem.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0B32BD6299B00D234B3 /* per_thread_sem.cc */; }; + 03DDA0B62BD629EC00D234B3 /* clock.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0B52BD629EC00D234B3 /* clock.cc */; }; + 03DDA0B82BD62A5600D234B3 /* time.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0B72BD62A5600D234B3 /* time.cc */; }; + 03DDA0BA2BD62A8D00D234B3 /* duration.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0B92BD62A8D00D234B3 /* duration.cc */; }; + 03DDA0BC2BD62A9F00D234B3 /* city.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0BB2BD62A9F00D234B3 /* city.cc */; }; + 03DDA0BE2BD62ABA00D234B3 /* low_level_hash.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0BD2BD62ABA00D234B3 /* low_level_hash.cc */; }; + 03DDA0C02BD62B0500D234B3 /* ascii.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0BF2BD62B0500D234B3 /* ascii.cc */; }; + 03DDA0C22BD62B2C00D234B3 /* raw_logging.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0C12BD62B2C00D234B3 /* raw_logging.cc */; }; + 03DDA0C42BD62B4D00D234B3 /* raw_hash_set.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0C32BD62B4D00D234B3 /* raw_hash_set.cc */; }; + 03DDA0C62BD62B8600D234B3 /* bind.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0C52BD62B8600D234B3 /* bind.cc */; }; + 03DDA0C82BD62BA900D234B3 /* output.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0C72BD62BA900D234B3 /* output.cc */; }; + 03DDA0CA2BD62BD400D234B3 /* extension.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0C92BD62BD400D234B3 /* extension.cc */; }; + 03DDA0CC2BD62C7F00D234B3 /* parser.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0CB2BD62C7F00D234B3 /* parser.cc */; }; + 03DDA0CE2BD62CA200D234B3 /* pthread_waiter.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0CD2BD62CA200D234B3 /* pthread_waiter.cc */; }; + 03DDA0D02BD62CCF00D234B3 /* waiter_base.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0CF2BD62CCF00D234B3 /* waiter_base.cc */; }; + 03DDA0D22BD62D6300D234B3 /* time_zone_lookup.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0D12BD62D6300D234B3 /* time_zone_lookup.cc */; }; + 
03DDA0D42BD62D9800D234B3 /* time_zone_impl.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0D32BD62D9800D234B3 /* time_zone_impl.cc */; }; + 03DDA0D82BD62DE100D234B3 /* time_zone_if.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0D72BD62DE100D234B3 /* time_zone_if.cc */; }; + 03DDA0D92BD62DF900D234B3 /* time_zone_info.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0D52BD62DBE00D234B3 /* time_zone_info.cc */; }; + 03DDA0DB2BD62E1E00D234B3 /* time_zone_libc.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0DA2BD62E1E00D234B3 /* time_zone_libc.cc */; }; + 03DDA0DD2BD62E5200D234B3 /* time_zone_posix.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0DC2BD62E5200D234B3 /* time_zone_posix.cc */; }; + 03DDA0DF2BD62E8E00D234B3 /* kernel_timeout.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0DE2BD62E8E00D234B3 /* kernel_timeout.cc */; }; + 03DDA0E12BD62ED700D234B3 /* symbolize.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0E02BD62ED700D234B3 /* symbolize.cc */; }; + 03DDA0E32BD62F0000D234B3 /* zone_info_source.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0E22BD62F0000D234B3 /* zone_info_source.cc */; }; + 03DDA0E52BD62F3100D234B3 /* time_zone_fixed.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0E42BD62F3100D234B3 /* time_zone_fixed.cc */; }; + 03DDA0E72BD62F5600D234B3 /* demangle.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0E62BD62F5600D234B3 /* demangle.cc */; }; + 03DDA0E92BD62FA600D234B3 /* arg.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0E82BD62FA600D234B3 /* arg.cc */; }; + 03DDA0EB2BD62FDC00D234B3 /* numbers.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0EA2BD62FDC00D234B3 /* numbers.cc */; }; + 03DDA0ED2BD6300C00D234B3 /* charconv.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0EC2BD6300C00D234B3 /* charconv.cc */; }; + 03DDA0EF2BD6324200D234B3 /* match.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0EE2BD6324200D234B3 /* match.cc */; }; + 03DDA0F12BD6326800D234B3 /* charconv_parse.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0F02BD6326800D234B3 /* charconv_parse.cc */; }; + 03DDA0F32BD6328C00D234B3 /* memutil.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0F22BD6328C00D234B3 /* memutil.cc */; }; + 03DDA0F52BD632B800D234B3 /* float_conversion.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0F42BD632B800D234B3 /* float_conversion.cc */; }; + 03DDA0F72BD632D800D234B3 /* charconv_bigint.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0F62BD632D800D234B3 /* charconv_bigint.cc */; }; + 03DDA0F92BD632F600D234B3 /* int128.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03DDA0F82BD632F600D234B3 /* int128.cc */; }; + 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = 03DDA0FA2BD6368100D234B3 /* base64.h */; }; + 03EC44DA2BD61805008D4E28 /* re2.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA772BD30C0B00DDFDC2 /* re2.cc */; }; + 03EC45882BD618AC008D4E28 /* rune.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEAAF2BD30C2400DDFDC2 /* rune.cc */; }; + 03EC45892BD618B9008D4E28 /* prog.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA642BD30C0A00DDFDC2 /* prog.cc */; }; + 03EC458A2BD618C5008D4E28 /* regexp.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA6D2BD30C0A00DDFDC2 /* regexp.cc */; }; + 03EC458B2BD618DF008D4E28 /* onepass.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA7B2BD30C0B00DDFDC2 /* onepass.cc */; }; + 03EC458C2BD61909008D4E28 /* parse.cc in Sources */ = {isa = 
PBXBuildFile; fileRef = 03BAEA662BD30C0A00DDFDC2 /* parse.cc */; }; + 03EC458D2BD61920008D4E28 /* bitstate.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA6C2BD30C0A00DDFDC2 /* bitstate.cc */; }; + 03EC458E2BD61929008D4E28 /* perl_groups.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA762BD30C0B00DDFDC2 /* perl_groups.cc */; }; + 03EC458F2BD61933008D4E28 /* unicode_casefold.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA692BD30C0A00DDFDC2 /* unicode_casefold.cc */; }; + 03EC45902BD61935008D4E28 /* unicode_groups.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA652BD30C0A00DDFDC2 /* unicode_groups.cc */; }; + 03EC45912BD6194D008D4E28 /* strutil.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEAAE2BD30C2400DDFDC2 /* strutil.cc */; }; + 03EC45922BD6195D008D4E28 /* dfa.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA682BD30C0A00DDFDC2 /* dfa.cc */; }; + 03EC45932BD6196F008D4E28 /* nfa.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA782BD30C0B00DDFDC2 /* nfa.cc */; }; + 03EC45942BD61977008D4E28 /* compile.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA6F2BD30C0A00DDFDC2 /* compile.cc */; }; + 03EC45952BD61987008D4E28 /* simplify.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA702BD30C0A00DDFDC2 /* simplify.cc */; }; + 03EC45962BD6199C008D4E28 /* bitmap256.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA6B2BD30C0A00DDFDC2 /* bitmap256.cc */; }; + 03EC45972BD619B3008D4E28 /* tostring.cc in Sources */ = {isa = PBXBuildFile; fileRef = 03BAEA812BD30C0B00DDFDC2 /* tostring.cc */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -91,8 +158,81 @@ 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; - 03729F142BB2043600152F2E /* tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tokenizer.cpp; sourceTree = ""; }; + 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bpe_tokenizer.cpp; sourceTree = ""; }; 03729F152BB2043600152F2E /* tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; + 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bpe_tokenizer.h; sourceTree = ""; }; + 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tiktoken.cpp; sourceTree = ""; }; + 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; + 03BAEA642BD30C0A00DDFDC2 /* prog.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = prog.cc; path = re2/prog.cc; sourceTree = ""; }; + 03BAEA652BD30C0A00DDFDC2 /* unicode_groups.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = unicode_groups.cc; path = re2/unicode_groups.cc; 
sourceTree = ""; }; + 03BAEA662BD30C0A00DDFDC2 /* parse.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = parse.cc; path = re2/parse.cc; sourceTree = ""; }; + 03BAEA682BD30C0A00DDFDC2 /* dfa.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = dfa.cc; path = re2/dfa.cc; sourceTree = ""; }; + 03BAEA692BD30C0A00DDFDC2 /* unicode_casefold.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = unicode_casefold.cc; path = re2/unicode_casefold.cc; sourceTree = ""; }; + 03BAEA6B2BD30C0A00DDFDC2 /* bitmap256.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bitmap256.cc; path = re2/bitmap256.cc; sourceTree = ""; }; + 03BAEA6C2BD30C0A00DDFDC2 /* bitstate.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bitstate.cc; path = re2/bitstate.cc; sourceTree = ""; }; + 03BAEA6D2BD30C0A00DDFDC2 /* regexp.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = regexp.cc; path = re2/regexp.cc; sourceTree = ""; }; + 03BAEA6F2BD30C0A00DDFDC2 /* compile.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = compile.cc; path = re2/compile.cc; sourceTree = ""; }; + 03BAEA702BD30C0A00DDFDC2 /* simplify.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = simplify.cc; path = re2/simplify.cc; sourceTree = ""; }; + 03BAEA722BD30C0A00DDFDC2 /* set.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = set.cc; path = re2/set.cc; sourceTree = ""; }; + 03BAEA752BD30C0B00DDFDC2 /* prefilter.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = prefilter.cc; path = re2/prefilter.cc; sourceTree = ""; }; + 03BAEA762BD30C0B00DDFDC2 /* perl_groups.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = perl_groups.cc; path = re2/perl_groups.cc; sourceTree = ""; }; + 03BAEA772BD30C0B00DDFDC2 /* re2.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = re2.cc; path = re2/re2.cc; sourceTree = ""; }; + 03BAEA782BD30C0B00DDFDC2 /* nfa.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = nfa.cc; path = re2/nfa.cc; sourceTree = ""; }; + 03BAEA7B2BD30C0B00DDFDC2 /* onepass.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = onepass.cc; path = re2/onepass.cc; sourceTree = ""; }; + 03BAEA7D2BD30C0B00DDFDC2 /* mimics_pcre.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mimics_pcre.cc; path = re2/mimics_pcre.cc; sourceTree = ""; }; + 03BAEA812BD30C0B00DDFDC2 /* tostring.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = tostring.cc; path = re2/tostring.cc; sourceTree = ""; }; + 03BAEA822BD30C0B00DDFDC2 /* filtered_re2.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = filtered_re2.cc; path = re2/filtered_re2.cc; sourceTree = ""; }; + 03BAEA832BD30C0B00DDFDC2 /* prefilter_tree.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = prefilter_tree.cc; path = re2/prefilter_tree.cc; sourceTree = ""; }; + 03BAEAAB2BD30C2400DDFDC2 /* pcre.cc */ = 
{isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = pcre.cc; path = util/pcre.cc; sourceTree = ""; }; + 03BAEAAE2BD30C2400DDFDC2 /* strutil.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = strutil.cc; path = util/strutil.cc; sourceTree = ""; }; + 03BAEAAF2BD30C2400DDFDC2 /* rune.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rune.cc; path = util/rune.cc; sourceTree = ""; }; + 03DDA09D2BD6263A00D234B3 /* mutex.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mutex.cc; path = synchronization/mutex.cc; sourceTree = ""; }; + 03DDA09F2BD6266000D234B3 /* graphcycles.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = graphcycles.cc; path = synchronization/internal/graphcycles.cc; sourceTree = ""; }; + 03DDA0A12BD6272700D234B3 /* stacktrace.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = stacktrace.cc; path = debugging/stacktrace.cc; sourceTree = ""; }; + 03DDA0A32BD6273D00D234B3 /* cycleclock.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = cycleclock.cc; path = base/internal/cycleclock.cc; sourceTree = ""; }; + 03DDA0A52BD6275000D234B3 /* spinlock_wait.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = spinlock_wait.cc; path = base/internal/spinlock_wait.cc; sourceTree = ""; }; + 03DDA0A72BD6275F00D234B3 /* low_level_alloc.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = low_level_alloc.cc; path = base/internal/low_level_alloc.cc; sourceTree = ""; }; + 03DDA0A92BD627C000D234B3 /* thread_identity.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = thread_identity.cc; path = base/internal/thread_identity.cc; sourceTree = ""; }; + 03DDA0AA2BD627C000D234B3 /* sysinfo.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = sysinfo.cc; path = base/internal/sysinfo.cc; sourceTree = ""; }; + 03DDA0AD2BD627D800D234B3 /* spinlock.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = spinlock.cc; path = base/internal/spinlock.cc; sourceTree = ""; }; + 03DDA0AF2BD6282500D234B3 /* hash.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = hash.cc; path = hash/internal/hash.cc; sourceTree = ""; }; + 03DDA0B12BD628DC00D234B3 /* create_thread_identity.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = create_thread_identity.cc; path = synchronization/internal/create_thread_identity.cc; sourceTree = ""; }; + 03DDA0B32BD6299B00D234B3 /* per_thread_sem.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = per_thread_sem.cc; path = synchronization/internal/per_thread_sem.cc; sourceTree = ""; }; + 03DDA0B52BD629EC00D234B3 /* clock.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = clock.cc; path = time/clock.cc; sourceTree = ""; }; + 03DDA0B72BD62A5600D234B3 /* time.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time.cc; path = time/time.cc; sourceTree = ""; }; + 03DDA0B92BD62A8D00D234B3 /* duration.cc */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = duration.cc; path = time/duration.cc; sourceTree = ""; }; + 03DDA0BB2BD62A9F00D234B3 /* city.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = city.cc; path = hash/internal/city.cc; sourceTree = ""; }; + 03DDA0BD2BD62ABA00D234B3 /* low_level_hash.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = low_level_hash.cc; path = hash/internal/low_level_hash.cc; sourceTree = ""; }; + 03DDA0BF2BD62B0500D234B3 /* ascii.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ascii.cc; path = strings/ascii.cc; sourceTree = ""; }; + 03DDA0C12BD62B2C00D234B3 /* raw_logging.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = raw_logging.cc; path = base/internal/raw_logging.cc; sourceTree = ""; }; + 03DDA0C32BD62B4D00D234B3 /* raw_hash_set.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = raw_hash_set.cc; path = container/internal/raw_hash_set.cc; sourceTree = ""; }; + 03DDA0C52BD62B8600D234B3 /* bind.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bind.cc; path = strings/internal/str_format/bind.cc; sourceTree = ""; }; + 03DDA0C72BD62BA900D234B3 /* output.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = output.cc; path = strings/internal/str_format/output.cc; sourceTree = ""; }; + 03DDA0C92BD62BD400D234B3 /* extension.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = extension.cc; path = strings/internal/str_format/extension.cc; sourceTree = ""; }; + 03DDA0CB2BD62C7F00D234B3 /* parser.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = parser.cc; path = strings/internal/str_format/parser.cc; sourceTree = ""; }; + 03DDA0CD2BD62CA200D234B3 /* pthread_waiter.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = pthread_waiter.cc; path = synchronization/internal/pthread_waiter.cc; sourceTree = ""; }; + 03DDA0CF2BD62CCF00D234B3 /* waiter_base.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = waiter_base.cc; path = synchronization/internal/waiter_base.cc; sourceTree = ""; }; + 03DDA0D12BD62D6300D234B3 /* time_zone_lookup.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_lookup.cc; path = time/internal/cctz/src/time_zone_lookup.cc; sourceTree = ""; }; + 03DDA0D32BD62D9800D234B3 /* time_zone_impl.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_impl.cc; path = time/internal/cctz/src/time_zone_impl.cc; sourceTree = ""; }; + 03DDA0D52BD62DBE00D234B3 /* time_zone_info.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_info.cc; path = time/internal/cctz/src/time_zone_info.cc; sourceTree = ""; }; + 03DDA0D72BD62DE100D234B3 /* time_zone_if.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_if.cc; path = time/internal/cctz/src/time_zone_if.cc; sourceTree = ""; }; + 03DDA0DA2BD62E1E00D234B3 /* time_zone_libc.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_libc.cc; 
path = time/internal/cctz/src/time_zone_libc.cc; sourceTree = ""; }; + 03DDA0DC2BD62E5200D234B3 /* time_zone_posix.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_posix.cc; path = time/internal/cctz/src/time_zone_posix.cc; sourceTree = ""; }; + 03DDA0DE2BD62E8E00D234B3 /* kernel_timeout.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = kernel_timeout.cc; path = synchronization/internal/kernel_timeout.cc; sourceTree = ""; }; + 03DDA0E02BD62ED700D234B3 /* symbolize.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = symbolize.cc; path = debugging/symbolize.cc; sourceTree = ""; }; + 03DDA0E22BD62F0000D234B3 /* zone_info_source.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = zone_info_source.cc; path = time/internal/cctz/src/zone_info_source.cc; sourceTree = ""; }; + 03DDA0E42BD62F3100D234B3 /* time_zone_fixed.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = time_zone_fixed.cc; path = time/internal/cctz/src/time_zone_fixed.cc; sourceTree = ""; }; + 03DDA0E62BD62F5600D234B3 /* demangle.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = demangle.cc; path = debugging/internal/demangle.cc; sourceTree = ""; }; + 03DDA0E82BD62FA600D234B3 /* arg.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arg.cc; path = strings/internal/str_format/arg.cc; sourceTree = ""; }; + 03DDA0EA2BD62FDC00D234B3 /* numbers.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = numbers.cc; path = strings/numbers.cc; sourceTree = ""; }; + 03DDA0EC2BD6300C00D234B3 /* charconv.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = charconv.cc; path = strings/charconv.cc; sourceTree = ""; }; + 03DDA0EE2BD6324200D234B3 /* match.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = match.cc; path = strings/match.cc; sourceTree = ""; }; + 03DDA0F02BD6326800D234B3 /* charconv_parse.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = charconv_parse.cc; path = strings/internal/charconv_parse.cc; sourceTree = ""; }; + 03DDA0F22BD6328C00D234B3 /* memutil.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = memutil.cc; path = strings/internal/memutil.cc; sourceTree = ""; }; + 03DDA0F42BD632B800D234B3 /* float_conversion.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = float_conversion.cc; path = strings/internal/str_format/float_conversion.cc; sourceTree = ""; }; + 03DDA0F62BD632D800D234B3 /* charconv_bigint.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = charconv_bigint.cc; path = strings/internal/charconv_bigint.cc; sourceTree = ""; }; + 03DDA0F82BD632F600D234B3 /* int128.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = int128.cc; path = numeric/int128.cc; sourceTree = ""; }; + 03DDA0FA2BD6368100D234B3 /* base64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = base64.h; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -183,6 +323,8 @@ 
isa = PBXGroup; children = ( 0324D69B2BAACB7C00DEF36F /* Exported */, + 03BAEB6A2BD316F000DDFDC2 /* absl */, + 03BAEA602BD30BA600DDFDC2 /* re2 */, 03729F062BB2035900152F2E /* runner */, 03729F0F2BB203E100152F2E /* sampler */, 03729F0E2BB203D700152F2E /* tokenizer */, @@ -232,7 +374,11 @@ 03729F0E2BB203D700152F2E /* tokenizer */ = { isa = PBXGroup; children = ( - 03729F142BB2043600152F2E /* tokenizer.cpp */, + 03DDA0FA2BD6368100D234B3 /* base64.h */, + 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */, + 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */, + 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */, + 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */, 03729F152BB2043600152F2E /* tokenizer.h */, ); name = tokenizer; @@ -249,6 +395,91 @@ path = ../../../../../models/llama2/sampler; sourceTree = ""; }; + 03BAEA602BD30BA600DDFDC2 /* re2 */ = { + isa = PBXGroup; + children = ( + 03BAEA6B2BD30C0A00DDFDC2 /* bitmap256.cc */, + 03BAEA6C2BD30C0A00DDFDC2 /* bitstate.cc */, + 03BAEA6F2BD30C0A00DDFDC2 /* compile.cc */, + 03BAEA682BD30C0A00DDFDC2 /* dfa.cc */, + 03BAEA822BD30C0B00DDFDC2 /* filtered_re2.cc */, + 03BAEA7D2BD30C0B00DDFDC2 /* mimics_pcre.cc */, + 03BAEA782BD30C0B00DDFDC2 /* nfa.cc */, + 03BAEA7B2BD30C0B00DDFDC2 /* onepass.cc */, + 03BAEA662BD30C0A00DDFDC2 /* parse.cc */, + 03BAEAAB2BD30C2400DDFDC2 /* pcre.cc */, + 03BAEA762BD30C0B00DDFDC2 /* perl_groups.cc */, + 03BAEA832BD30C0B00DDFDC2 /* prefilter_tree.cc */, + 03BAEA752BD30C0B00DDFDC2 /* prefilter.cc */, + 03BAEA642BD30C0A00DDFDC2 /* prog.cc */, + 03BAEA772BD30C0B00DDFDC2 /* re2.cc */, + 03BAEA6D2BD30C0A00DDFDC2 /* regexp.cc */, + 03BAEAAF2BD30C2400DDFDC2 /* rune.cc */, + 03BAEA722BD30C0A00DDFDC2 /* set.cc */, + 03BAEA702BD30C0A00DDFDC2 /* simplify.cc */, + 03BAEAAE2BD30C2400DDFDC2 /* strutil.cc */, + 03BAEA812BD30C0B00DDFDC2 /* tostring.cc */, + 03BAEA692BD30C0A00DDFDC2 /* unicode_casefold.cc */, + 03BAEA652BD30C0A00DDFDC2 /* unicode_groups.cc */, + ); + name = re2; + path = "../../../../../models/llama2/third-party/re2"; + sourceTree = ""; + }; + 03BAEB6A2BD316F000DDFDC2 /* absl */ = { + isa = PBXGroup; + children = ( + 03DDA0E82BD62FA600D234B3 /* arg.cc */, + 03DDA0BF2BD62B0500D234B3 /* ascii.cc */, + 03DDA0C52BD62B8600D234B3 /* bind.cc */, + 03DDA0F62BD632D800D234B3 /* charconv_bigint.cc */, + 03DDA0F02BD6326800D234B3 /* charconv_parse.cc */, + 03DDA0EC2BD6300C00D234B3 /* charconv.cc */, + 03DDA0BB2BD62A9F00D234B3 /* city.cc */, + 03DDA0B52BD629EC00D234B3 /* clock.cc */, + 03DDA0B12BD628DC00D234B3 /* create_thread_identity.cc */, + 03DDA0A32BD6273D00D234B3 /* cycleclock.cc */, + 03DDA0E62BD62F5600D234B3 /* demangle.cc */, + 03DDA0B92BD62A8D00D234B3 /* duration.cc */, + 03DDA0C92BD62BD400D234B3 /* extension.cc */, + 03DDA0F42BD632B800D234B3 /* float_conversion.cc */, + 03DDA09F2BD6266000D234B3 /* graphcycles.cc */, + 03DDA0AF2BD6282500D234B3 /* hash.cc */, + 03DDA0F82BD632F600D234B3 /* int128.cc */, + 03DDA0DE2BD62E8E00D234B3 /* kernel_timeout.cc */, + 03DDA0A72BD6275F00D234B3 /* low_level_alloc.cc */, + 03DDA0BD2BD62ABA00D234B3 /* low_level_hash.cc */, + 03DDA0EE2BD6324200D234B3 /* match.cc */, + 03DDA0F22BD6328C00D234B3 /* memutil.cc */, + 03DDA09D2BD6263A00D234B3 /* mutex.cc */, + 03DDA0EA2BD62FDC00D234B3 /* numbers.cc */, + 03DDA0C72BD62BA900D234B3 /* output.cc */, + 03DDA0CB2BD62C7F00D234B3 /* parser.cc */, + 03DDA0B32BD6299B00D234B3 /* per_thread_sem.cc */, + 03DDA0CD2BD62CA200D234B3 /* pthread_waiter.cc */, + 03DDA0C32BD62B4D00D234B3 /* raw_hash_set.cc */, + 03DDA0C12BD62B2C00D234B3 /* raw_logging.cc */, + 03DDA0A52BD6275000D234B3 /* 
spinlock_wait.cc */, + 03DDA0AD2BD627D800D234B3 /* spinlock.cc */, + 03DDA0A12BD6272700D234B3 /* stacktrace.cc */, + 03DDA0E02BD62ED700D234B3 /* symbolize.cc */, + 03DDA0AA2BD627C000D234B3 /* sysinfo.cc */, + 03DDA0A92BD627C000D234B3 /* thread_identity.cc */, + 03DDA0E42BD62F3100D234B3 /* time_zone_fixed.cc */, + 03DDA0D72BD62DE100D234B3 /* time_zone_if.cc */, + 03DDA0D32BD62D9800D234B3 /* time_zone_impl.cc */, + 03DDA0D52BD62DBE00D234B3 /* time_zone_info.cc */, + 03DDA0DA2BD62E1E00D234B3 /* time_zone_libc.cc */, + 03DDA0D12BD62D6300D234B3 /* time_zone_lookup.cc */, + 03DDA0DC2BD62E5200D234B3 /* time_zone_posix.cc */, + 03DDA0B72BD62A5600D234B3 /* time.cc */, + 03DDA0CF2BD62CCF00D234B3 /* waiter_base.cc */, + 03DDA0E22BD62F0000D234B3 /* zone_info_source.cc */, + ); + name = absl; + path = "../../../../../models/llama2/third-party/abseil-cpp/absl"; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -256,8 +487,11 @@ isa = PBXHeadersBuildPhase; buildActionMask = 2147483647; files = ( + 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */, 03729F172BB2043600152F2E /* tokenizer.h in Headers */, 03729EE22BB1F93E00152F2E /* LLaMARunner.h in Headers */, + 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */, + 03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */, 03729F122BB2042B00152F2E /* sampler.h in Headers */, 03729F0C2BB203B300152F2E /* util.h in Headers */, 03729F0B2BB203B300152F2E /* runner.h in Headers */, @@ -402,10 +636,74 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 03DDA0D42BD62D9800D234B3 /* time_zone_impl.cc in Sources */, + 03DDA0C62BD62B8600D234B3 /* bind.cc in Sources */, + 03DDA0F52BD632B800D234B3 /* float_conversion.cc in Sources */, 03729EE12BB1F93800152F2E /* LLaMARunner.mm in Sources */, - 03729F162BB2043600152F2E /* tokenizer.cpp in Sources */, + 03DDA0BE2BD62ABA00D234B3 /* low_level_hash.cc in Sources */, + 03DDA0AC2BD627C000D234B3 /* sysinfo.cc in Sources */, + 03EC458E2BD61929008D4E28 /* perl_groups.cc in Sources */, + 03EC458F2BD61933008D4E28 /* unicode_casefold.cc in Sources */, + 03DDA0C02BD62B0500D234B3 /* ascii.cc in Sources */, + 03DDA0BA2BD62A8D00D234B3 /* duration.cc in Sources */, + 03EC458A2BD618C5008D4E28 /* regexp.cc in Sources */, + 03DDA0AE2BD627D800D234B3 /* spinlock.cc in Sources */, + 03DDA0E72BD62F5600D234B3 /* demangle.cc in Sources */, + 03DDA0B82BD62A5600D234B3 /* time.cc in Sources */, + 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */, + 03EC45932BD6196F008D4E28 /* nfa.cc in Sources */, + 03DDA0F32BD6328C00D234B3 /* memutil.cc in Sources */, + 03DDA0BC2BD62A9F00D234B3 /* city.cc in Sources */, + 03DDA0D82BD62DE100D234B3 /* time_zone_if.cc in Sources */, + 03DDA0ED2BD6300C00D234B3 /* charconv.cc in Sources */, + 03EC45912BD6194D008D4E28 /* strutil.cc in Sources */, + 03DDA0F12BD6326800D234B3 /* charconv_parse.cc in Sources */, + 03DDA0E52BD62F3100D234B3 /* time_zone_fixed.cc in Sources */, + 03DDA0F72BD632D800D234B3 /* charconv_bigint.cc in Sources */, + 03EC45902BD61935008D4E28 /* unicode_groups.cc in Sources */, + 03EC458B2BD618DF008D4E28 /* onepass.cc in Sources */, + 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */, + 03EC45892BD618B9008D4E28 /* prog.cc in Sources */, + 03EC45972BD619B3008D4E28 /* tostring.cc in Sources */, + 03DDA09E2BD6263A00D234B3 /* mutex.cc in Sources */, + 03DDA0A42BD6273D00D234B3 /* cycleclock.cc in Sources */, + 03DDA0C42BD62B4D00D234B3 /* raw_hash_set.cc in Sources */, + 03DDA0A62BD6275000D234B3 /* spinlock_wait.cc in Sources 
*/, + 03DDA0D02BD62CCF00D234B3 /* waiter_base.cc in Sources */, + 03DDA0A22BD6272700D234B3 /* stacktrace.cc in Sources */, + 03DDA0DD2BD62E5200D234B3 /* time_zone_posix.cc in Sources */, 03729F0A2BB203B300152F2E /* runner.cpp in Sources */, + 03DDA0EB2BD62FDC00D234B3 /* numbers.cc in Sources */, + 03DDA0A82BD6275F00D234B3 /* low_level_alloc.cc in Sources */, + 03EC45942BD61977008D4E28 /* compile.cc in Sources */, + 03EC45962BD6199C008D4E28 /* bitmap256.cc in Sources */, + 03EC458D2BD61920008D4E28 /* bitstate.cc in Sources */, + 03EC458C2BD61909008D4E28 /* parse.cc in Sources */, + 03DDA0D92BD62DF900D234B3 /* time_zone_info.cc in Sources */, + 03DDA0D22BD62D6300D234B3 /* time_zone_lookup.cc in Sources */, + 03DDA0B22BD628DC00D234B3 /* create_thread_identity.cc in Sources */, + 03DDA0E92BD62FA600D234B3 /* arg.cc in Sources */, + 03DDA0E32BD62F0000D234B3 /* zone_info_source.cc in Sources */, + 03DDA0CC2BD62C7F00D234B3 /* parser.cc in Sources */, 03729F132BB2042B00152F2E /* sampler.cpp in Sources */, + 03EC45952BD61987008D4E28 /* simplify.cc in Sources */, + 03DDA0DF2BD62E8E00D234B3 /* kernel_timeout.cc in Sources */, + 03DDA0C82BD62BA900D234B3 /* output.cc in Sources */, + 03DDA0DB2BD62E1E00D234B3 /* time_zone_libc.cc in Sources */, + 03DDA0EF2BD6324200D234B3 /* match.cc in Sources */, + 03EC45882BD618AC008D4E28 /* rune.cc in Sources */, + 03EC44DA2BD61805008D4E28 /* re2.cc in Sources */, + 03DDA0CE2BD62CA200D234B3 /* pthread_waiter.cc in Sources */, + 03DDA0B42BD6299B00D234B3 /* per_thread_sem.cc in Sources */, + 03EC45922BD6195D008D4E28 /* dfa.cc in Sources */, + 03DDA0CA2BD62BD400D234B3 /* extension.cc in Sources */, + 03DDA0AB2BD627C000D234B3 /* thread_identity.cc in Sources */, + 03DDA0B62BD629EC00D234B3 /* clock.cc in Sources */, + 03DDA0A02BD6266000D234B3 /* graphcycles.cc in Sources */, + 03DDA0B02BD6282500D234B3 /* hash.cc in Sources */, + 03DDA0E12BD62ED700D234B3 /* symbolize.cc in Sources */, + 03DDA0C22BD62B2C00D234B3 /* raw_logging.cc in Sources */, + 03DDA0F92BD632F600D234B3 /* int128.cc in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -581,6 +879,8 @@ "-force_load", "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Debug-0.a", "-force_load", + "$(BUILT_PRODUCTS_DIR)/libquantized_backend-Debug-0.a", + "-force_load", "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Debug-0.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Debug-0.a", @@ -593,6 +893,8 @@ "-force_load", "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Debug-1.a", "-force_load", + "$(BUILT_PRODUCTS_DIR)/libquantized_backend-Debug-1.a", + "-force_load", "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Debug-1.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Debug-1.a", @@ -607,7 +909,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -639,6 +941,8 @@ "-force_load", "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Release-0.a", "-force_load", + "$(BUILT_PRODUCTS_DIR)/libquantized_backend-Release-0.a", + "-force_load", "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Release-0.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Release-0.a", @@ -651,6 +955,8 @@ "-force_load", "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Release-1.a", "-force_load", + "$(BUILT_PRODUCTS_DIR)/libquantized_backend-Release-1.a", + "-force_load", "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Release-1.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Release-1.a", @@ -665,7 +971,7 @@ 
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -683,8 +989,16 @@ DYLIB_INSTALL_NAME_BASE = "@rpath"; ENABLE_MODULE_VERIFIER = YES; GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "ET_USE_TIKTOKEN=1", + ); GENERATE_INFOPLIST_FILE = YES; - HEADER_SEARCH_PATHS = "\"$(SRCROOT)/../../../../..\""; + HEADER_SEARCH_PATHS = ( + "\"$(SRCROOT)/../../../../..\"", + "\"$(SRCROOT)/../../../models/llama2/third-party/abseil-cpp\"", + "\"$(SRCROOT)/../../../models/llama2/third-party/re2\"", + ); INFOPLIST_KEY_NSHumanReadableCopyright = ""; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; IPHONEOS_DEPLOYMENT_TARGET = 17.0; @@ -708,7 +1022,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; }; @@ -728,8 +1042,13 @@ DYLIB_INSTALL_NAME_BASE = "@rpath"; ENABLE_MODULE_VERIFIER = YES; GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_PREPROCESSOR_DEFINITIONS = "ET_USE_TIKTOKEN=1"; GENERATE_INFOPLIST_FILE = YES; - HEADER_SEARCH_PATHS = "\"$(SRCROOT)/../../../../..\""; + HEADER_SEARCH_PATHS = ( + "\"$(SRCROOT)/../../../../..\"", + "\"$(SRCROOT)/../../../models/llama2/third-party/abseil-cpp\"", + "\"$(SRCROOT)/../../../models/llama2/third-party/re2\"", + ); INFOPLIST_KEY_NSHumanReadableCopyright = ""; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; IPHONEOS_DEPLOYMENT_TARGET = 17.0; @@ -753,7 +1072,7 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; }; @@ -796,7 +1115,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = latest; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index 5d7ddbc388f..9afb0cafb37 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -150,6 +150,7 @@ struct ContentView: View { } } } + .navigationViewStyle(StackNavigationViewStyle()) } private func generate() { @@ -215,7 +216,7 @@ struct ContentView: View { tokens.append(token) if tokens.count > 2 { let text = tokens.joined() - let count = text.count + let count = tokens.count tokens = [] DispatchQueue.main.async { withAnimation { diff --git a/examples/demo-apps/apple_ios/LLaMA/README.md b/examples/demo-apps/apple_ios/LLaMA/README.md new file mode 100644 index 00000000000..ddd542a0066 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/README.md @@ -0,0 +1,52 @@ +# Building ExecuTorch LLaMA iOS Demo App + +This app demonstrates the LLaMA chat use case, running inference locally on-device with ExecuTorch.
+ +## Prerequisites +* [Xcode 15](https://developer.apple.com/xcode) +* [iOS 17 SDK](https://developer.apple.com/ios) +* Set up your ExecuTorch repo and environment if you haven’t done so already by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) tutorial: + +```bash +git clone -b release/0.2 https://github.com/pytorch/executorch.git +cd executorch +git submodule update --init + +python3 -m venv .venv && source .venv/bin/activate + +./install_requirements.sh +``` + +## Exporting models +Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. + +## Run the App + +1. Open the [project](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj) in Xcode. +2. Run the app (cmd+R). +3. In the app UI, pick a model and tokenizer to use, type a prompt, and tap the arrow button. + +```{note} +The ExecuTorch runtime is distributed as a Swift package that provides prebuilt .xcframework binary targets. +Xcode will download and cache the package on the first run, which will take some time. +``` + +## Copy the model to Simulator + +1. Drag and drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +2. Pick the files in the app dialog, type a prompt, and click the arrow-up button. + +## Copy the model to Device + +1. Wire-connect the device and open the contents in Finder. +2. Navigate to the Files tab and drag and drop the model and tokenizer files onto the iLLaMA folder. +3. Wait until the files are copied. + +Click the image below to see it in action! + + + iOS app running a LLaMA model + + +## Reporting Issues +If you encounter any bugs or issues while following this tutorial, please file an issue on [GitHub](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt new file mode 100644 index 00000000000..c605e947409 --- /dev/null +++ b/examples/llm_manual/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) +project(nanogpt_runner) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Set options for executorch build. +option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) +option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) +option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend + +# Include the executorch subdirectory.
+add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch + ${CMAKE_BINARY_DIR}/executorch) + +# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) + +add_executable(nanogpt_runner main.cpp) +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend diff --git a/examples/llm_manual/README.md b/examples/llm_manual/README.md new file mode 100644 index 00000000000..0ee6bb6a9f1 --- /dev/null +++ b/examples/llm_manual/README.md @@ -0,0 +1,3 @@ +# LLM Manual + +This directory stores the files that the [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started.html) needs. Please refer to the documentation website for more information. diff --git a/examples/llm_manual/basic_sampler.h b/examples/llm_manual/basic_sampler.h new file mode 100644 index 00000000000..a95b823de8d --- /dev/null +++ b/examples/llm_manual/basic_sampler.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <algorithm> +#include <vector> +class BasicSampler { + public: + BasicSampler() {} + int64_t sample(std::vector<float> logits) { + // Find the token with the highest log probability. + int64_t max_index = + std::max_element(logits.begin(), logits.end()) - logits.begin(); + return max_index; + } +}; diff --git a/examples/llm_manual/basic_tokenizer.h b/examples/llm_manual/basic_tokenizer.h new file mode 100644 index 00000000000..eb51d15fc50 --- /dev/null +++ b/examples/llm_manual/basic_tokenizer.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <cctype> +#include <fstream> +#include <iostream> +#include <string> +#include <unordered_map> +#include <vector> + +class BasicTokenizer { + public: + BasicTokenizer(const std::string& filePath) { + std::ifstream file(filePath); + + if (!file) { + std::cerr << "Unable to open file"; + exit(9); // return with error code + } + std::string str( + (std::istreambuf_iterator<char>(file)), + std::istreambuf_iterator<char>()); + + size_t i = 0u; + i = consume_whitespace(str, i); + i = expect(str, i, '{'); + + while (i < str.size() && str[i] != '}') { + i = consume_field(str, i); + } + + // Build decode map as inverse of encode. + for (auto& i : encode_) { + decode_[i.second] = i.first; + } + } + + std::vector<int64_t> encode(const std::string& prompt) { + std::vector<std::string> words = parse_prompt(prompt); + std::vector<int64_t> result; + for (auto word : words) { + result.push_back(encode_[word]); + } + return result; + } + + std::string decode(const std::vector<int64_t>& indices) { + std::string result; + for (const auto& index : indices) { + result += decode_[index]; + } + return result; + } + + private: + std::unordered_map<std::string, int64_t> encode_; + std::unordered_map<int64_t, std::string> decode_; + + // Advance the input string index until a non-whitespace character is found + // or it reaches the end of string.
+ size_t consume_whitespace(const std::string& data, size_t i) { + while (i < data.size() && std::isspace(data[i])) { + i++; + } + + return i; + } + + // Consumes a JSON field of the form + // "str": id, + size_t consume_field(const std::string& data, size_t i) { + i = consume_whitespace(data, i); + + // Parse the key literal. + i = expect(data, i, '"'); + + auto in_escape = false; + std::string key = ""; + while (i < data.size()) { + if (in_escape) { + key += data[i]; + i++; + in_escape = false; + } else { // !in_escape + if (data[i] == '"') { // End of string literal + i++; + break; + } else if (data[i] == '\\') { // Escaped code point + in_escape = true; + } + key += data[i]; + i++; + } + } + + key = post_process_key(key); + + i = expect(data, i, ':'); + i = consume_whitespace(data, i); + + // Read unsigned integer value + auto value_start = i; + while (i < data.size() && std::isdigit(data[i])) { + i++; + } + auto value = static_cast<int64_t>( + std::stol(data.substr(value_start, i - value_start))); + + encode_[key] = value; + + i = consume_whitespace(data, i); + if (i < data.size() && data[i] == ',') { + i++; + } + + return i; + } + + // Assert that the next character in the input string is equal to c. Increment + // the input string index by one. + size_t expect(const std::string& data, size_t i, char c) { + if (i >= data.size() || data[i] != c) { + std::cerr << "Invalid tokenizer vocabulary file. Expected '" << c + << "' at index " << i << std::endl; + exit(1); + } + + return i + 1; + } + + std::string post_process_key(std::string key) { + // Replace the unicode characters with the corresponding byte encoding + // TODO: adopt byte encoder to handle unicode characters in json file. + + std::unordered_map<std::string, std::string> replacements = { + {"\\u0120", " "}, + {"\\u010a", "\n"}, + }; + + for (const auto& replacement : replacements) { + size_t pos = 0; + // While loop through all instances of the substring in the string + while ((pos = key.find(replacement.first, pos)) != std::string::npos) { + key.replace(pos, replacement.first.length(), replacement.second); + pos += replacement.second.length(); + } + } + + // remove duplicate backslashes + for (size_t idx = 0; idx < key.length(); idx++) { + if (key[idx] == '\\') { + key.erase(idx, 1); + if (key[idx] == '\\') { + // If there are two backslashes, keep the second one + idx += 1; + } + } + } + + return key; + } + std::vector<std::string> parse_prompt(const std::string& prompt) { + std::vector<std::string> result; + std::string word; + for (char c : prompt) { + if (c == ' ') { + if (!word.empty()) { + result.push_back(word); + word.clear(); + } + word += c; + } else if (ispunct(c)) { + if (!word.empty()) { + result.push_back(word); + word.clear(); + } + result.push_back(std::string(1, c)); + } else { + word += c; + } + } + if (!word.empty()) { + result.push_back(word); + } + return result; + } +}; diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py new file mode 100644 index 00000000000..cf29a69c080 --- /dev/null +++ b/examples/llm_manual/export_nanogpt.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
+ +# export_nanogpt.py + +# Load partitioner for Xnnpack backend +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# Model to be delegated to specific backend should use specific edge compile config +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config +from executorch.exir import to_edge + +from model import GPT +from torch._export import capture_pre_autograd_graph +from torch.export import export +from torch.nn.attention import sdpa_kernel, SDPBackend + +model = GPT.from_pretrained("gpt2") # use gpt2 weight as pretrained weight +example_inputs = ( + torch.randint(0, 100, (1, model.config.block_size), dtype=torch.long), +) +dynamic_shape = ({1: torch.export.Dim("token_dim", max=model.config.block_size)},) + +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape) + traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape) + +# Convert the model into a runnable ExecuTorch program. +# To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config +edge_config = get_xnnpack_edge_compile_config() +edge_manager = to_edge(traced_model, compile_config=edge_config) + +# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner. +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +et_program = edge_manager.to_executorch() + +# Save the Xnnpack-delegated ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp new file mode 100644 index 00000000000..2b336059cff --- /dev/null +++ b/examples/llm_manual/main.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// main.cpp + +#include +#include +#include +#include + +#include "basic_sampler.h" +#include "basic_tokenizer.h" +#include "managed_tensor.h" + +#include +#include +#include +#include +#include + +using namespace torch::executor; + +using SizesType = exec_aten::SizesType; +using DimOrderType = exec_aten::DimOrderType; +using StridesType = exec_aten::StridesType; + +// main.cpp + +#define ENDOFTEXT 50256 + +std::string generate( + Module& llm_model, + std::string& prompt, + BasicTokenizer& tokenizer, + BasicSampler& sampler, + size_t max_input_length, + size_t max_output_length) { + // Convert the input text into a list of integers (tokens) that represents + // it, using the string-to-token mapping that the model was trained on. + // Each token is an integer that represents a word or part of a word. + std::vector input_tokens = tokenizer.encode(prompt); + std::vector output_tokens; + + for (auto i = 0u; i < max_output_length; i++) { + // Convert the input_tokens from a vector of int64_t to EValue. + // EValue is a unified data type in the ExecuTorch runtime. + ManagedTensor tensor_tokens( + input_tokens.data(), + {1, static_cast(input_tokens.size())}, + ScalarType::Long); + std::vector inputs = {tensor_tokens.get_tensor()}; + + // Run the model. It will return a tensor of logits (log-probabilities). 
+ Result> logits_evalue = llm_model.forward(inputs); + + // Convert the output logits from EValue to std::vector, which is what + // the sampler expects. + Tensor logits_tensor = logits_evalue.get()[0].toTensor(); + std::vector logits( + logits_tensor.data_ptr(), + logits_tensor.data_ptr() + logits_tensor.numel()); + + // Sample the next token from the logits. + int64_t next_token = sampler.sample(logits); + + // Break if we reached the end of the text. + if (next_token == ENDOFTEXT) { + break; + } + + // Add the next token to the output. + output_tokens.push_back(next_token); + + std::cout << tokenizer.decode({next_token}); + std::cout.flush(); + + // Update next input. + input_tokens.push_back(next_token); + if (input_tokens.size() > max_input_length) { + input_tokens.erase(input_tokens.begin()); + } + } + + std::cout << std::endl; + + // Convert the output tokens into a human-readable string. + std::string output_string = tokenizer.decode(output_tokens); + return output_string; +} + +// main.cpp + +int main() { + // Set up the prompt. This provides the seed text for the model to elaborate. + std::cout << "Prompt: "; + std::string prompt; + std::getline(std::cin, prompt); + + // The tokenizer is used to convert between tokens (used by the model) and + // human-readable strings. + BasicTokenizer tokenizer("vocab.json"); + + // The sampler is used to sample the next token from the logits. + BasicSampler sampler = BasicSampler(); + + // Load the exported nanoGPT program, which was generated via the previous + // steps. + Module model( + "nanogpt.pte", + torch::executor::Module::MlockConfig::UseMlockIgnoreErrors); + + const auto max_input_tokens = 1024; + const auto max_output_tokens = 30; + std::cout << prompt; + generate( + model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens); +} diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h new file mode 100644 index 00000000000..d401ae4d18b --- /dev/null +++ b/examples/llm_manual/managed_tensor.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#pragma once + +namespace torch { +namespace executor { + +/** + * A tensor wrapper takes ownership of all the memory of the necessary metadata + * for torch::executor::Tensor. Note that it doesn't own the data memory. + */ +class ManagedTensor { + public: + /// The type used for elements of `sizes()`. + using SizesType = exec_aten::SizesType; + /// The type used for elements of `dim_order()`. + using DimOrderType = exec_aten::DimOrderType; + /// The type used for elements of `strides()`. + using StridesType = exec_aten::StridesType; + ManagedTensor() = delete; + + explicit ManagedTensor( + void* data, + const std::vector& sizes, + ScalarType dtype) + : dtype_(dtype), sizes_(sizes), data_ptr_(data) { + ssize_t dim = sizes.size(); + dim_order_.resize(dim); + strides_.resize(dim); + for (size_t i = 0; i < dim; ++i) { + dim_order_[i] = i; + } + dim_order_to_stride_nocheck( + sizes.data(), dim_order_.data(), dim, strides_.data()); + tensor_impl_ = std::make_unique( + dtype_, + dim, + sizes_.data(), + data_ptr_, + dim_order_.data(), + strides_.data(), + TensorShapeDynamism::DYNAMIC_BOUND); + } + + /** + * Get the Tensor object managed by this class. 
+ */ + Tensor get_tensor() { + return Tensor(tensor_impl_.get()); + } + + private: + void* data_ptr_ = nullptr; + std::unique_ptr tensor_impl_; + std::vector sizes_; + std::vector strides_; + std::vector dim_order_; + ScalarType dtype_; +}; +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index ea4096074ee..17841cd4eb4 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -18,8 +18,24 @@ cmake_minimum_required(VERSION 3.19) project(llama_runner) +# Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF) +option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF) + +include(CMakeDependentOption) +# +# pthreadpool: build pthreadpool library. Disable on unsupported platforms +# +cmake_dependent_option( + EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON + "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) +# +# cpuinfo: build cpuinfo library. Disable on unsupported platforms +# +cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON + "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) + if(NOT PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() @@ -49,22 +65,16 @@ set(_common_compile_options -Wno-deprecated-declarations -fPIC) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -# For some reason android build is not able to find where gflags is -# and hence cannot find corresponding .cmake file +# For some reason android build is not able to find where gflags is and hence +# cannot find corresponding .cmake file set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) find_package(gflags REQUIRED) # # llama_main: test binary to run llama, with tokenizer and sampler integrated # -add_executable(llama_main main.cpp -${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp) -if(CMAKE_BUILD_TYPE EQUAL "RELEASE") - target_link_options(llama_main PRIVATE "LINKER:--gc-sections") -endif() -# find `executorch` libraries -# Same as for gflags +# find `executorch` libraries Same as for gflags set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch) find_package(executorch CONFIG REQUIRED) if(CMAKE_TOOLCHAIN_IOS OR ANDROID) @@ -72,32 +82,77 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) endif() # custom ops library -add_subdirectory(custom_ops) +if(EXECUTORCH_BUILD_CUSTOM) + add_subdirectory(custom_ops) +endif() # llama_runner library add_subdirectory(runner) +if(EXECUTORCH_USE_TIKTOKEN) + # find RE2 for tokenizer + set(ABSL_ENABLE_INSTALL ON) + set(ABSL_PROPAGATE_CXX_STD ON) + set(_pic_flag + ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) + set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + target_link_libraries(llama_runner PUBLIC re2::re2) +endif() -target_include_directories(llama_main PUBLIC -${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include) -target_include_directories(llama_main PUBLIC -${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include) - -set(link_libraries) +set(link_libraries gflags) +set(_srcs main.cpp) if(EXECUTORCH_BUILD_OPTIMIZED) - list(APPEND link_libraries optimized_native_cpu_ops_lib optimized_kernels - portable_kernels cpublas eigen_blas) + list( + APPEND + 
link_libraries + optimized_native_cpu_ops_lib + optimized_kernels + portable_kernels + cpublas + eigen_blas) target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() list(APPEND link_libraries portable_ops_lib portable_kernels) target_link_options_shared_lib(portable_ops_lib) endif() -target_link_libraries(llama_main PUBLIC gflags llama_runner custom_ops_lib) +# quantized_ops_lib: Register quantized op kernels into the runtime +target_link_options_shared_lib(quantized_ops_lib) +list(APPEND link_libraries quantized_kernels quantized_ops_lib) + +if(EXECUTORCH_BUILD_CUSTOM) + target_link_options_shared_lib(custom_ops) + list(APPEND link_libraries custom_ops) +endif() + +set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) +# Extra compile option and include dir for pthreadpool +if(EXECUTORCH_BUILD_PTHREADPOOL) + list(APPEND _common_compile_options -DET_USE_THREADPOOL) + list(APPEND link_libraries pthreadpool) + # These 2 source files are included in xnnpack_backend + if(NOT TARGET xnnpack_backend) + list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/threadpool.cpp + ${XNNPACK_ROOT}/threadpool/threadpool_guard.cpp) + endif() + list(APPEND _common_include_directories + ${XNNPACK_ROOT}/third-party/pthreadpool/include) +endif() -# XNNPACK pthreadpool cpuinfo +# Extra sources for cpuinfo +if(EXECUTORCH_BUILD_CPUINFO) + list(APPEND link_libraries cpuinfo) + list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp) + list(APPEND _common_include_directories + ${XNNPACK_ROOT}/third-party/cpuinfo/include) +endif() + +# XNNPACK if(TARGET xnnpack_backend) - set(xnnpack_backend_libs xnnpack_backend XNNPACK pthreadpool cpuinfo) + set(xnnpack_backend_libs xnnpack_backend XNNPACK) list(APPEND link_libraries ${xnnpack_backend_libs}) target_link_options_shared_lib(xnnpack_backend) endif() @@ -114,15 +169,19 @@ if(TARGET qnn_executorch_backend) target_link_options_shared_lib(qnn_executorch_backend) endif() -# This one is needed for cpuinfo where it uses android -# specific log lib +# This one is needed for cpuinfo where it uses android specific log lib if(ANDROID) list(APPEND link_libraries log) endif() -target_compile_options(llama_main PUBLIC ${_common_compile_options} - -DET_USE_THREADPOOL) -target_link_libraries(llama_main PUBLIC ${link_libraries}) +add_executable(llama_main ${_srcs}) +if(CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(llama_main PRIVATE "LINKER:--gc-sections,-s") +endif() + +target_include_directories(llama_main PUBLIC ${_common_include_directories}) +target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries}) +target_compile_options(llama_main PUBLIC ${_common_compile_options}) if(APPLE) target_link_options_shared_lib(executorch) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index d392673d34a..f2ed9990f97 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -5,7 +5,7 @@ This example demonstrates how to run a [Llama 2](https://ai.meta.com/llama/) 7B For Llama2, please refer to [the llama's github page](https://github.com/facebookresearch/llama) for details. Pretrained parameters are not included in this repo. Users are suggested to download them through [the llama's download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). -# What is Llama 2? +# What are Llama 2 and 3? Llama is a family of large language models that uses publicly available data for training. 
These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important. @@ -14,32 +14,39 @@ Overall, Llama models are powerful and versatile language models that can be use Please note that the models are subject to the [acceptable use policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md) and the provided [responsible use guide](https://ai.meta.com/static-resource/responsible-use-guide/). - # Results Since 7B Llama2 model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. +For Llama3, we can use the same process. Note that it is currently supported only on the ExecuTorch main branch. + ## Quantization: -We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html). +We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizing activations dynamically, such that the quantization parameters for activations are calculated from the min/max range at runtime. Here we quantized activations to 8 bits (signed integer). Furthermore, weights are statically quantized. In our case, weights were per-channel groupwise quantized to a 4-bit signed integer. For more information refer to this [page](https://github.com/pytorch-labs/ao/). -We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes. +We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes, with max_seq_len 2048 and 1000 samples. -|Llama 2 | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) |--------|-----------------| ---------------------- | --------------- -|Wikitext Perplexity | 9.16 | 10.2 | 10.7 +|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) |--------|-----------------| ---------------------- | --------------- +|Llama 2 7B | 9.2 | 10.2 | 10.7 +|Llama 3 8B | 7.9 | 9.4 | 9.7 Note that groupsize less than 128 was not enabled, since such model were still too large.
This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. +## Enablement + +We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. + +For Llama 3 8B, we have verified so far on iPhone 15 Pro Max and OnePlus 12 (with 16GB RAM). + ## Performance -Performance was measured on Samsung Galaxy S22, S23, S24 and One Plus 12. Measurement performance is in terms of tokens/second. +Llama2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). |Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) |--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | |Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | -|One plus 12 | 11.55 tokens/second | 11.6 tokens/second | -|iPhone 15 pro | x | x | +|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | # Instructions @@ -50,7 +57,7 @@ Performance was measured on Samsung Galaxy S22, S23, S24 and One Plus 12. Measur - For Llama7b, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. ## Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` 2. Run `examples/models/llama2/install_requirements.sh` to install a few dependencies. ## Step 2: Prepare model @@ -61,10 +68,17 @@ You can export and run the original Llama2 7B model. 1. Llama2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). -2. Export model and generate `.pte` file: +2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. + +3. Export model and generate `.pte` file: ``` python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` +4. Create tokenizer.bin. + + ``` + python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + ``` ### Option B: Download and export stories110M model @@ -89,6 +103,18 @@ If you want to deploy and run a smaller model for educational purposes. From `ex python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` +### Option C: Download and export Llama3 8B model + +You can export and run the original Llama3 8B model. + +1. Llama3 pretrained parameters can be downloaded from [Meta's official llama3 repository](https://github.com/meta-llama/llama3/). + +2. 
Export model and generate `.pte` file + ``` + python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_id":128001}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + ``` + + Due to the larger vocabulary size of Llama3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` to further reduce the model size. ## (Optional) Finetuning @@ -134,7 +160,9 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_QUANTIZED=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ -Bcmake-out . cmake --build cmake-out -j16 --target install --config Release @@ -145,18 +173,25 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_QUANTIZED=ON \ -Bcmake-out/examples/models/llama2 \ examples/models/llama2 cmake --build cmake-out/examples/models/llama2 -j16 --config Release ``` +For Llama3, add the `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner. + 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). ``` cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= ``` +For Llama3, you can pass the original `tokenizer.model` (without converting it to a `.bin` file). + ## Step 5: Run benchmark on Android phone **1. Build llama runner binary for Android** @@ -208,22 +243,35 @@ cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release **2.2 Upload model, tokenizer and llama runner binary to phone** ``` -adb push /data/local/tmp/ -adb push /data/local/tmp/ -adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/ +adb shell mkdir -p /data/local/tmp/llama +adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ +adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ ``` **2.3 Run model** ``` -adb shell "cd /data/local/tmp && ./llama_main --model_path --tokenizer_path --prompt "Once upon a time" --seq_len 120 +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" ``` -## Step 6: Build iOS and/or Android apps +## Step 6: Build Mobile apps -TODO ### iOS -### Android app +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) for full instructions on building the iOS LLAMA Demo App. + +### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +## Optional: Smaller models delegated to other backends +Currently we support lowering the stories model to other backends, including CoreML, MPS, and QNN. Please refer to the instructions +for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm.html)) before trying to lower them.
After the backend library is installed, the scripts to export a lowered model are: + +- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --coreml -c stories110M.pt -p params.json` +- MPS: `python -m examples.models.llama2.export_llama -kv --mps -c stories110M.pt -p params.json` +- QNN: `python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json` + +The iOS LLAMA app supports the CoreML and MPS models, and the Android LLAMA app supports the QNN model. On Android, you can also cross compile the llama runner binary, push it to the device, and run it. + # What is coming next? ## Quantization - Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. @@ -238,7 +286,6 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de - Enabling LLama2 7b and other architectures via Vulkan - Enabling performant execution of widely used quantization schemes. -TODO # Notes This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch: @@ -247,12 +294,17 @@ This example tries to reuse the Python code, with minimal modifications to make 3. No dependencies on fairscale. The ColumnParallelLinear, ParallelEmbedding and training are not needed and supported in ExecuTorch. -# Clean -To clean your build: +# Common Issues and Mitigations: +- To clean your build: ``` git clean -xfd pip uninstall executorch -./install_requirements.sh +./install_requirements.sh --pybind xnnpack rm -rf cmake-out ``` +- If you encounter `pthread` related issues during link time, add `pthread` to `target_link_libraries` in `CMakeLists.txt`. + +# Disclaimer + +The ExecuTorch Repository Content is provided without any guarantees about performance or compatibility. In particular, ExecuTorch makes available model architectures written in Python for PyTorch that may not perform in the same manner or meet the same standards as the original versions of those models. When using the ExecuTorch Repository Content, including any model architectures, you are solely responsible for determining the appropriateness of using or redistributing the ExecuTorch Repository Content and assume any risks associated with your use of the ExecuTorch Repository Content or any models, outputs, or results, both alone and in combination with any other technologies. Additionally, you may have other legal obligations that govern your use of other content, such as the terms of service for third-party models, weights, data, or other technologies, and you are solely responsible for complying with all such obligations.
\ No newline at end of file diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index c93ea6149ff..9610510f244 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -18,7 +18,6 @@ runtime.python_library( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2/custom_ops:llama_custom_ops_aot_lib", ], ) @@ -28,7 +27,6 @@ runtime.python_library( "__init__.py", "fairseq2.py", "model.py", - "quantize.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama2", @@ -37,13 +35,13 @@ runtime.python_library( }, visibility = [ "//bento/...", + "//bento_kernels/...", "//executorch/...", ], deps = [ "//caffe2:torch", "//executorch/examples/models:model_base", "//executorch/examples/models/llama2:llama_transformer", - "//executorch/examples/models/llama2/ops:quantized_aot_lib", ], ) @@ -52,6 +50,7 @@ runtime.python_binary( main_module = "executorch.examples.models.llama2.export_llama", # visibility = ["//executorch/examples/..."], preload_deps = [ + "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_lib", "//executorch/kernels/quantized:aot_lib", ], deps = [ @@ -67,14 +66,18 @@ runtime.python_library( "builder.py", "export_llama.py", "export_llama_lib.py", + "lib/partitioner_lib.py", + "lib/quant_lib.py", "model.py", - "quant_lib.py", - "quantize.py", + "source_transformation/quantize.py", + "source_transformation/rope.py", + "source_transformation/sdpa.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama2", visibility = [ "//bento/...", + "//bento_kernels/...", "//executorch/examples/...", ], deps = [ @@ -85,6 +88,7 @@ runtime.python_library( "//executorch/backends/vulkan/partitioner:vulkan_partitioner", "//executorch/examples/models:model_base", "//executorch/examples/models:models", + "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_py", "//executorch/examples/portable:utils", "//executorch/exir:lib", "//executorch/sdk/etrecord:etrecord", diff --git a/examples/models/llama2/builder.py b/examples/models/llama2/builder.py index 3473391b641..b05dc19bfc0 100644 --- a/examples/models/llama2/builder.py +++ b/examples/models/llama2/builder.py @@ -62,7 +62,9 @@ def to_torch_dtype(self) -> torch.dtype: def load_llama_model( *, - checkpoint: str, + modelname: str = "llama2", + checkpoint: Optional[str] = None, + checkpoint_dir: Optional[str] = None, params_path: str, use_kv_cache: bool = False, use_sdpa_with_kv_cache: bool = False, @@ -76,7 +78,9 @@ def load_llama_model( Returns: An instance of LlamaEdgeManager which contains the eager mode model. 
""" - assert checkpoint and params_path, "Both checkpoint and params can't be empty" + assert ( + checkpoint or checkpoint_dir + ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty" logging.info( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) @@ -84,6 +88,7 @@ def load_llama_model( "llama2", "Llama2Model", checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, params=params_path, use_kv_cache=use_kv_cache, use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, @@ -110,6 +115,7 @@ def load_llama_model( return LlamaEdgeManager( model=model, + modelname=modelname, weight_type=weight_type, dtype=dtype, use_kv_cache=use_kv_cache, @@ -127,6 +133,7 @@ class LlamaEdgeManager: def __init__( self, model, + modelname, weight_type, dtype, use_kv_cache, @@ -135,6 +142,7 @@ def __init__( verbose: bool = False, ): self.model = model + self.modelname = modelname self.weight_type = weight_type self.dtype = dtype self.example_inputs = example_inputs @@ -202,11 +210,7 @@ def source_transform( def _get_dynamic_shape(self) -> Any: dim = torch.export.Dim("token_dim", max=self.model.params.max_seq_len - 1) if self.use_kv_cache: - if self.use_sdpa_with_kv_cache: - return None - else: - # return {1: dim}, {0: dim}} TODO update xnnpack to be able to handle dynamic shape kv cache - return None + return None else: return ({1: dim},) diff --git a/examples/models/llama2/custom_ops/CMakeLists.txt b/examples/models/llama2/custom_ops/CMakeLists.txt index d06f3d5de81..5075807b8db 100644 --- a/examples/models/llama2/custom_ops/CMakeLists.txt +++ b/examples/models/llama2/custom_ops/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT TORCH_ROOT) set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch) endif() -set(_common_compile_options -Wno-deprecated-declarations) +set(_common_compile_options -Wno-deprecated-declarations -fPIC) include(${EXECUTORCH_ROOT}/build/Utils.cmake) include(${EXECUTORCH_ROOT}/build/Codegen.cmake) @@ -44,21 +44,12 @@ include(${EXECUTORCH_SRCS_FILE}) set(_common_include_directories ${EXECUTORCH_ROOT}/..) # Custom op libraries -set(custom_ops_libs extension_module) +set(custom_ops_libs executorch_no_prim_ops) list(APPEND custom_ops_libs pthreadpool) list(APPEND custom_ops_libs cpuinfo) list(APPEND custom_ops_libs cpublas) list(APPEND custom_ops_libs eigen_blas) -# Generate C++ bindings to register kernels into both PyTorch (for AOT) and -# Executorch (for runtime). 
Here select all ops in optimized.yaml -set(_yaml "${CMAKE_CURRENT_LIST_DIR}/custom_ops.yaml") -gen_selected_ops("${_yaml}" "" "") - -generate_bindings_for_kernels(FUNCTIONS_YAML - ${CMAKE_CURRENT_SOURCE_DIR}/custom_ops.yaml) -message("Generated files ${gen_command_sources}") - list(TRANSFORM _custom_ops__srcs PREPEND "${EXECUTORCH_ROOT}/") # TODO: Consider moving xnnpack/threadpool in a separate lib since it's now used @@ -70,6 +61,8 @@ if(NOT EXECUTORCH_BUILD_XNNPACK) "${CMAKE_CURRENT_SOURCE_DIR}/../../../../backends/xnnpack/threadpool/threadpool.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../../../backends/xnnpack/threadpool/threadpool_guard.cpp" ) +else() + list(APPEND custom_ops_libs xnnpack_backend) endif() add_library(custom_ops ${_custom_ops__srcs}) @@ -82,7 +75,20 @@ target_link_libraries(custom_ops PUBLIC ${custom_ops_libs}) target_compile_options(custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL) -# Build a library for _custom_ops_srcs -# -# custom_ops_lib: Register optimized ops kernels into Executorch runtime -gen_operators_lib("custom_ops_lib" KERNEL_LIBS custom_ops DEPS executorch) +install(TARGETS custom_ops DESTINATION lib) + +if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT) + # Add a AOT library + find_package(Torch CONFIG REQUIRED) + add_library(custom_ops_aot_lib SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp) + target_include_directories(custom_ops_aot_lib + PUBLIC "${_common_include_directories}") + target_include_directories( + custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../../include") + target_link_libraries(custom_ops_aot_lib PUBLIC custom_ops torch) + target_compile_options(custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations + -fPIC -frtti -fexceptions) + + install(TARGETS custom_ops_aot_lib DESTINATION lib) +endif() diff --git a/examples/models/llama2/custom_ops/TARGETS b/examples/models/llama2/custom_ops/TARGETS index 2341af9282f..195df3bb931 100644 --- a/examples/models/llama2/custom_ops/TARGETS +++ b/examples/models/llama2/custom_ops/TARGETS @@ -1,8 +1,23 @@ # Any targets that should be shared between fbcode and xplat must be defined in # targets.bzl. This file can contain fbcode-only targets. +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") oncall("executorch") define_common_targets() + +runtime.python_test( + name = "test_sdpa_with_kv_cache", + srcs = [ + "test_sdpa_with_kv_cache.py", + ], + preload_deps = [ + ":custom_ops_aot_lib", + ":custom_ops_aot_py", + ], + deps = [ + "//caffe2:torch", + ], +) diff --git a/.swift/coreml_backend/dummy.swift b/examples/models/llama2/custom_ops/__init__.py similarity index 100% rename from .swift/coreml_backend/dummy.swift rename to examples/models/llama2/custom_ops/__init__.py diff --git a/examples/models/llama2/custom_ops/custom_ops.yaml b/examples/models/llama2/custom_ops/custom_ops.yaml deleted file mode 100644 index 8de14c6aaaf..00000000000 --- a/examples/models/llama2/custom_ops/custom_ops.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that have optimized kernels available. - -- func: llama::sdpa.out(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(a!) out) -> Tensor(a!) 
- variants: function - kernels: - - arg_meta: null - kernel_name: torch::executor::flash_attention_kernel_out - -- func: llama::sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, Tensor(b!) value_cache, int start_pos, int seq_len, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) out) -> Tensor(c!) - kernels: - - arg_meta: null - kernel_name: torch::executor::sdpa_with_kv_cache_out diff --git a/examples/models/llama2/custom_ops/op_sdpa.cpp b/examples/models/llama2/custom_ops/op_sdpa.cpp index 18e24eb867c..dd0fa67ec08 100644 --- a/examples/models/llama2/custom_ops/op_sdpa.cpp +++ b/examples/models/llama2/custom_ops/op_sdpa.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include @@ -22,6 +22,7 @@ #include #include #endif +#include namespace torch { namespace executor { @@ -218,13 +219,29 @@ void cpu_flash_attention( int64_t qSize = query.size(2); int64_t headSize = query.size(3); int64_t kvSize = value.size(2); + int64_t num_heads_kv = key.size(1); if (is_with_kv_cache) { num_head = query.size(2); + num_heads_kv = key.size(2); qSize = query.size(1); kvSize = value.size(1); } + ET_CHECK_MSG( + num_heads_kv <= num_head, + "FlashAttention does not support num kv heads > num query heads.Got num query heads=%" PRId64 + " num key heads:%" PRId64, + num_head, + num_heads_kv); + ET_CHECK_MSG( + num_head % num_heads_kv == 0, + "FlashAttention: num qyery heads must be divisible by num kv heads but got num query heads=%" PRId64 + " and num kv heads=%" PRId64, + num_head, + num_heads_kv); + int64_t num_reps = num_head / num_heads_kv; + bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel(); if (has_attn_mask) { /* @@ -364,6 +381,7 @@ void cpu_flash_attention( fill_stub( qk_max_data, -std::numeric_limits::infinity(), qBlockSize); int64_t num_keys = is_causal ? std::min(m + qBlockSize, kvSize) : kvSize; + auto j_kv = j / num_reps; for (int64_t n = 0; n < num_keys; n += kvSplitSize) { int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); // Calculate scale * q @ k.T @@ -375,7 +393,7 @@ void cpu_flash_attention( qBlockSize, headSize, static_cast(1), - k_data + i * kStrideB + j * kStrideH + n * kStrideN, + k_data + i * kStrideB + j_kv * kStrideH + n * kStrideN, kStrideN, q_data + i * qStrideB + j * qStrideH + m * qStrideM, qStrideM, @@ -459,7 +477,7 @@ void cpu_flash_attention( qBlockSize, kvBlockSize, static_cast(1), - v_data + i * vStrideB + j * vStrideH + n * vStrideN, + v_data + i * vStrideB + j_kv * vStrideH + n * vStrideN, vStrideN, conditional_data_ptr(qk_data, qk_reduced_data), kvBlockSize, @@ -843,3 +861,8 @@ Tensor& sdpa_with_kv_cache_out( } // namespace native } // namespace executor } // namespace torch + +EXECUTORCH_LIBRARY( + llama, + "sdpa_with_kv_cache.out", + torch::executor::native::sdpa_with_kv_cache_out); diff --git a/examples/models/llama2/custom_ops/op_sdpa.h b/examples/models/llama2/custom_ops/op_sdpa.h new file mode 100644 index 00000000000..fd130964ebb --- /dev/null +++ b/examples/models/llama2/custom_ops/op_sdpa.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +namespace native { + +Tensor& sdpa_with_kv_cache_out( + RuntimeContext& ctx, + const Tensor& q_projected, + const Tensor& k_projected, + const Tensor& v_projected, + Tensor& key_cache, + Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output); + +Tensor& flash_attention_kernel_out( + RuntimeContext& ctx, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output); + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/custom_ops/op_sdpa_aot.cpp b/examples/models/llama2/custom_ops/op_sdpa_aot.cpp new file mode 100644 index 00000000000..ed735406ad5 --- /dev/null +++ b/examples/models/llama2/custom_ops/op_sdpa_aot.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace torch { +namespace executor { + +namespace native { + +Tensor& sdpa_with_kv_cache_out_no_context( + const Tensor& q_projected, + const Tensor& k_projected, + const Tensor& v_projected, + Tensor& key_cache, + Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output) { + exec_aten::RuntimeContext context{}; + return torch::executor::native::sdpa_with_kv_cache_out( + context, + q_projected, + k_projected, + v_projected, + key_cache, + value_cache, + start_pos, + seq_len, + attn_mask, + dropout_p, + is_causal, + scale, + output); +} + +at::Tensor sdpa_with_kv_cache_aten( + const at::Tensor& q_projected, + const at::Tensor& k_projected, + const at::Tensor& v_projected, + at::Tensor& key_cache, + at::Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const c10::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const c10::optional scale) { + auto output = at::empty_like(q_projected); + WRAP_TO_ATEN(sdpa_with_kv_cache_out_no_context, 11) + (q_projected, + k_projected, + v_projected, + key_cache, + value_cache, + start_pos, + seq_len, + attn_mask, + dropout_p, + is_causal, + scale, + output); + return output; +} + +} // namespace native +} // namespace executor +} // namespace torch + +TORCH_LIBRARY(llama, m) { + m.def( + "sdpa_with_kv_cache(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " + "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " + "float drpout_p=0.0, bool is_causal=False, float? 
scale=None) -> Tensor"); + m.def( + "sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " + "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " + "float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) out) -> Tensor(c!)"); +} + +TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { + m.impl( + "sdpa_with_kv_cache", torch::executor::native::sdpa_with_kv_cache_aten); + m.impl( + "sdpa_with_kv_cache.out", + WRAP_TO_ATEN( + torch::executor::native::sdpa_with_kv_cache_out_no_context, 11)); +} diff --git a/examples/models/llama2/custom_ops/op_sdpa_test.cpp b/examples/models/llama2/custom_ops/op_sdpa_test.cpp index 293359d19c9..971e8cf45cb 100644 --- a/examples/models/llama2/custom_ops/op_sdpa_test.cpp +++ b/examples/models/llama2/custom_ops/op_sdpa_test.cpp @@ -8,7 +8,8 @@ #include -#include // Declares the operator +#include + #include #include #include @@ -28,7 +29,7 @@ exec_aten::Tensor op_scaled_dot_product_attention( exec_aten::optional scale, exec_aten::Tensor& out) { exec_aten::RuntimeContext context{}; - return torch::executor::llama::sdpa_outf( + return torch::executor::native::flash_attention_kernel_out( context, query, key, value, attn_mask, dropout_p, is_causal, scale, out); } diff --git a/examples/models/llama2/custom_ops/op_sdpa_with_kv_cache_test.cpp b/examples/models/llama2/custom_ops/op_sdpa_with_kv_cache_test.cpp index 6ec6f429264..fa2d164fe3d 100644 --- a/examples/models/llama2/custom_ops/op_sdpa_with_kv_cache_test.cpp +++ b/examples/models/llama2/custom_ops/op_sdpa_with_kv_cache_test.cpp @@ -8,7 +8,7 @@ #include -#include // Declares the operator +#include // Declares the operator #include #include #include @@ -32,7 +32,7 @@ exec_aten::Tensor op_sdpa_with_kv_cache( exec_aten::optional scale, exec_aten::Tensor& out) { exec_aten::RuntimeContext context{}; - return torch::executor::llama::sdpa_with_kv_cache_outf( + return torch::executor::native::sdpa_with_kv_cache_out( context, query, key, diff --git a/examples/models/llama2/custom_ops/sdpa_with_kv_cache.py b/examples/models/llama2/custom_ops/sdpa_with_kv_cache.py index 5f11defb11d..bada40220bc 100644 --- a/examples/models/llama2/custom_ops/sdpa_with_kv_cache.py +++ b/examples/models/llama2/custom_ops/sdpa_with_kv_cache.py @@ -4,21 +4,29 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# Import custom op defined in op_sdpa_aot.cpp. Those ops are using PyTorch +# C++ APIs for registration so here we need to import the shared library. +# This is only needed for OSS. + +import logging +from pathlib import Path + import torch -from torch.library import impl, impl_abstract -custom_ops_lib = torch.library.Library("llama", "DEF") -custom_ops_lib.define( - "sdpa_with_kv_cache(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " - "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " - "float drpout_p=0.0, bool is_causal=False, float? scale=None) -> Tensor" -) +from torch.library import impl -custom_ops_lib.define( - "sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " - "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " - "float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) 
out) -> Tensor(c!)" -) +try: + op = torch.ops.llama.sdpa_with_kv_cache.default + assert op is not None +except: + libs = list(Path(__file__).parent.resolve().glob("libcustom_ops_aot_lib.*")) + assert len(libs) == 1, f"Expected 1 library but got {len(libs)}" + logging.info(f"Loading custom ops library: {libs[0]}") + torch.ops.load_library(libs[0]) + op = torch.ops.llama.sdpa_with_kv_cache.default + assert op is not None + +custom_ops_lib = torch.library.Library("llama", "IMPL") def _validate_params( @@ -118,82 +126,3 @@ def sdpa_with_kv_cache_meta( ) return torch.empty_like(query) - - -@impl(custom_ops_lib, "sdpa_with_kv_cache", "CompositeExplicitAutograd") -def sdpa_with_kv_cache( - query, - key, - value, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask=None, - drpout_p=0.0, - is_causal=False, - scale=None, -): - _validate_params( - query, - key, - value, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - drpout_p, - is_causal, - scale, - ) - - if attn_mask is not None: - attn_mask = attn_mask[start_pos].view((1, -1)) - attn_mask = attn_mask[:, : start_pos + seq_len] - q = query.transpose(1, 2) - key_cache[:, start_pos] = key - value_cache[:, start_pos] = value - - sliced_k_cache = key_cache - sliced_v_cache = value_cache - sliced_k_cache = sliced_k_cache[:, : start_pos + seq_len, :, :] - sliced_v_cache = sliced_v_cache[:, : start_pos + seq_len, :, :] - sliced_k_cache = sliced_k_cache.transpose(1, 2) - sliced_v_cache = sliced_v_cache.transpose(1, 2) - out = torch.nn.functional.scaled_dot_product_attention( - q, sliced_k_cache, sliced_v_cache, attn_mask=attn_mask - ) - out = out.transpose(1, 2) - return out - - -@impl_abstract("llama::sdpa_with_kv_cache.out") -def sdpa_with_kv_cache_out( - query, - key, - value, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - drpout_p, - is_causal, - scale, - out, -): - out = sdpa_with_kv_cache_meta( - query, - key, - value, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - drpout_p, - is_causal, - scale, - ) - return out diff --git a/examples/models/llama2/custom_ops/targets.bzl b/examples/models/llama2/custom_ops/targets.bzl index ab611125fd0..cac83abe07d 100644 --- a/examples/models/llama2/custom_ops/targets.bzl +++ b/examples/models/llama2/custom_ops/targets.bzl @@ -1,41 +1,4 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib") -load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper") - -def define_tests(): - codegen_function_header_wrapper("executorch/examples/models/llama2/custom_ops", "custom_ops") - - # In the long run we should really have aten variant available as well - deps = [":function_header_wrapper_custom_ops"] - generated_lib_and_op_deps = [ - ":custom_ops", - ":sdpa", - ":custom_ops_headers", - ] - runtime.cxx_test( - name = "op_sdpa_test", - srcs = [ - "op_sdpa_test.cpp", - ], - visibility = ["//executorch/..."], - deps = [ - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - "//executorch/kernels/test:test_util", - ] + generated_lib_and_op_deps + deps, - ) - runtime.cxx_test( - name = "op_sdpa_with_kv_cache_test", - srcs = [ - "op_sdpa_with_kv_cache_test.cpp", - ], - visibility = ["//executorch/..."], - deps = [ - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - 
"//executorch/kernels/test:test_util", - ] + generated_lib_and_op_deps + deps, - ) def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -43,86 +6,83 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - - runtime.python_library( - name = "llama_custom_ops_aot_lib", - srcs = [ - "sdpa_with_kv_cache.py", + runtime.cxx_library( + name = "custom_ops", + srcs = ["op_sdpa.cpp"], + exported_headers = ["op_sdpa.h"], + exported_deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/kernels/optimized:libblas", + "//executorch/kernels/optimized:libvec", + "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/parallel:thread_parallel", + "//executorch/backends/xnnpack/threadpool:threadpool", ], + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], visibility = [ "//executorch/...", + "//executorch/examples/models/llama2/custom_ops/...", "@EXECUTORCH_CLIENTS", ], - deps = [ - "//caffe2:torch", - ], + # @lint-ignore BUCKLINT link_whole + link_whole = True, + force_static = True, ) - runtime.export_file( - name = "custom_ops.yaml", + runtime.cxx_library( + name = "custom_ops_aot_lib", + srcs = [ + "op_sdpa_aot.cpp", + ], visibility = [ "//executorch/...", "@EXECUTORCH_CLIENTS", ], - ) - - # ~~~ START of custom ops 1 `my_ops::mul3` library definitions ~~~ - et_operator_library( - name = "sdpa_op", - ops = [ - "llama::sdpa.out", + external_deps = [ + "libtorch", ], - define_static_targets = True, - visibility = [ - "//executorch/codegen/...", - "@EXECUTORCH_CLIENTS", + deps = [ + ":custom_ops", + "//executorch/extension/aten_util:aten_bridge", ], ) - et_operator_library( - name = "sdpa_with_kv_cache", - ops = [ - "llama::sdpa_with_kv_cache.out", + runtime.python_library( + name = "custom_ops_aot_py", + srcs = [ + "sdpa_with_kv_cache.py", ], - define_static_targets = True, - visibility = [ - "//executorch/codegen/...", - "@EXECUTORCH_CLIENTS", + visibility = ["//executorch/..."], + deps = [ + "//caffe2:torch", ], ) - runtime.cxx_library( - name = "sdpa", - srcs = ["op_sdpa.cpp"], - deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/kernels/optimized:libblas", - "//executorch/kernels/optimized:libvec", - "//executorch/extension/parallel:thread_parallel", - "//executorch/backends/xnnpack/threadpool:threadpool", + runtime.cxx_test( + name = "op_sdpa_test", + srcs = [ + "op_sdpa_test.cpp", ], - compiler_flags = ["-Wno-missing-prototypes"], - visibility = [ - "//executorch/...", - "//executorch/examples/models/llama2/custom_ops/...", - "@EXECUTORCH_CLIENTS", + visibility = ["//executorch/..."], + deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/kernels/test:test_util", + ":custom_ops", ], - force_static = True, ) - executorch_generated_lib( - name = "custom_ops", - deps = [ - ":sdpa_op", - ":sdpa_with_kv_cache", - ":sdpa", + runtime.cxx_test( + name = "op_sdpa_with_kv_cache_test", + srcs = [ + "op_sdpa_with_kv_cache_test.cpp", ], - custom_ops_yaml_target = ":custom_ops.yaml", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", + visibility = ["//executorch/..."], + deps = [ + "//executorch/runtime/core/exec_aten:lib", + 
"//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/kernels/test:test_util", + ":custom_ops", ], - define_static_targets = True, ) - define_tests() diff --git a/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py b/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py new file mode 100644 index 00000000000..abf3abc0284 --- /dev/null +++ b/examples/models/llama2/custom_ops/test_sdpa_with_kv_cache.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +import torch.nn.functional as F + +from .sdpa_with_kv_cache import custom_ops_lib # noqa + + +class SDPATest(unittest.TestCase): + + def setUp(self): + torch.manual_seed(42) + self.k_cache = torch.zeros((1, 5, 8, 4)) + self.v_cache = torch.zeros((1, 5, 8, 4)) + self.mask = torch.full( + (5, 5), + float("-inf"), + ) + self.mask = torch.triu(self.mask, diagonal=1) + + def _sdpa_with_kv_cache_ref(self, q, k, v, k_cache, v_cache, mask, start_pos): + print(f"at start_pos:{start_pos}") + print(q) + print(k) + print(v) + attn_mask = mask[start_pos].view((1, -1)) + attn_mask = attn_mask[:, : start_pos + 1] + q = q.transpose(1, 2) + k_cache[:, start_pos] = k + v_cache[:, start_pos] = v + sliced_k_cache = k_cache[:, : start_pos + 1, :, :] + sliced_v_cache = v_cache[:, : start_pos + 1, :, :] + sliced_k_cache = sliced_k_cache.transpose(1, 2) + sliced_v_cache = sliced_v_cache.transpose(1, 2) + # print(sliced_k_cache.size()) + # print(torch.matmul(q, sliced_k_cache.transpose(2, 3))) + # print("q @ k") + # qk = torch.matmul(q, sliced_k_cache.transpose(2, 3)) + # qk_softmax = torch.softmax(qk, dim=-1) + # qkv = torch.matmul(qk_softmax, sliced_v_cache) + # print(qk) + # print(qk_softmax) + # print(qkv) + out = F.scaled_dot_product_attention( + q, sliced_k_cache, sliced_v_cache, attn_mask=attn_mask + ) + out = out.transpose(1, 2) + print(out) + print(f"-------- start pos {start_pos} done -----") + return out + + def test_sdpa_with_cache_no_mqa_1(self): + q = torch.rand((1, 1, 8, 4)) + k = torch.rand((1, 1, 8, 4)) + v = torch.rand((1, 1, 8, 4)) + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 0 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 0, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + def test_sdpa_with_cache_no_mqa_2(self): + q = torch.rand((1, 1, 8, 4)) + k = torch.rand((1, 1, 8, 4)) + v = torch.rand((1, 1, 8, 4)) + + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 1 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 1, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + def test_sdpa_with_cache_no_mqa_3(self): + q = torch.rand((1, 1, 8, 4)) + k = torch.rand((1, 1, 8, 4)) + v = torch.rand((1, 1, 8, 4)) + + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 2 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 2, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + def test_sdpa_with_cache_no_mqa_4(self): + q = torch.rand((1, 1, 8, 4)) + k = torch.rand((1, 1, 8, 4)) + v = torch.rand((1, 1, 8, 4)) + + ref_output = self._sdpa_with_kv_cache_ref( + q, k, 
v, self.k_cache, self.v_cache, self.mask, 3 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 3, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + +class SDPATestWithMQA(unittest.TestCase): + + def setup_caches(self): + self.k_cache = torch.zeros((1, 5, self.n_heads_kv, 4)) + self.v_cache = torch.zeros((1, 5, self.n_heads_kv, 4)) + + def setUp(self): + torch.manual_seed(42) + self.n_heads_kv = 4 + self.n_heads_q = 8 + self.setup_caches() + self.mask = torch.full( + (5, 5), + float("-inf"), + ) + self.mask = torch.triu(self.mask, diagonal=1) + + def _sdpa_with_kv_cache_ref(self, q, k, v, k_cache, v_cache, mask, start_pos): + print(f"at start_pos:{start_pos}") + print(q) + print(k) + print(v) + attn_mask = mask[start_pos].view((1, -1)) + attn_mask = attn_mask[:, : start_pos + 1] + q = q.transpose(1, 2) + k_cache[:, start_pos] = k + v_cache[:, start_pos] = v + sliced_k_cache = k_cache[:, : start_pos + 1, :, :] + sliced_v_cache = v_cache[:, : start_pos + 1, :, :] + sliced_k_cache = sliced_k_cache.transpose(1, 2) + sliced_v_cache = sliced_v_cache.transpose(1, 2) + # print(sliced_k_cache.size()) + # print(torch.matmul(q, sliced_k_cache.transpose(2, 3))) + # print("q @ k") + # qk = torch.matmul(q, sliced_k_cache.transpose(2, 3)) + # qk_softmax = torch.softmax(qk, dim=-1) + # qkv = torch.matmul(qk_softmax, sliced_v_cache) + # print(qk) + # print(qk_softmax) + # print(qkv) + num_heads_q = q.size(1) + num_heads_kv = sliced_k_cache.size(1) + if num_heads_q != num_heads_kv: + assert ( + num_heads_q % num_heads_kv == 0 + ), f"{num_heads_q} not divisible by {num_heads_kv}" + n_reps = num_heads_q // num_heads_kv + if n_reps > 1: + sliced_k_cache = sliced_k_cache.repeat_interleave(n_reps, dim=1) + sliced_v_cache = sliced_v_cache.repeat_interleave(n_reps, dim=1) + out = F.scaled_dot_product_attention( + q, sliced_k_cache, sliced_v_cache, attn_mask=attn_mask + ) + out = out.transpose(1, 2) + print(out) + print(f"-------- start pos {start_pos} done -----") + return out + + def test_sdpa_with_cache_mqa_1(self): + q = torch.rand((1, 1, self.n_heads_q, 4)) + k = torch.rand((1, 1, self.n_heads_kv, 4)) + v = torch.rand((1, 1, self.n_heads_kv, 4)) + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 0 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 0, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + def test_sdpa_with_cache_mqa_2(self): + q = torch.rand((1, 1, self.n_heads_q, 4)) + k = torch.rand((1, 1, self.n_heads_kv, 4)) + v = torch.rand((1, 1, self.n_heads_kv, 4)) + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 1 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 1, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) + + def test_sdpa_with_cache_mqa_3(self): + self.n_heads_q = 14 + self.n_heads_kv = 7 + self.setup_caches() + q = torch.rand((1, 1, self.n_heads_q, 4)) + k = torch.rand((1, 1, self.n_heads_kv, 4)) + v = torch.rand((1, 1, self.n_heads_kv, 4)) + ref_output = self._sdpa_with_kv_cache_ref( + q, k, v, self.k_cache, self.v_cache, self.mask, 1 + ) + op_output = torch.ops.llama.sdpa_with_kv_cache( + q, k, v, self.k_cache, self.v_cache, 1, 1, None, 0, False + ) + self.assertTrue(torch.allclose(ref_output, op_output)) diff --git a/examples/models/llama2/eval_llama_lib.py 
b/examples/models/llama2/eval_llama_lib.py index c9650531705..c9faeb556c8 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -6,16 +6,22 @@ import argparse -from typing import Optional + +from typing import Optional, Union import lm_eval import torch +from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama2.tokenizer.tokenizer import ( + Tokenizer as SentencePieceTokenizer, +) + from lm_eval.api.model import LM from lm_eval.evaluator import evaluate from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict -from sentencepiece import SentencePieceProcessor + from torch import nn from .builder import LlamaEdgeManager @@ -33,20 +39,21 @@ class GPTFastEvalWrapper(eval_wrapper): def __init__( self, model: nn.Module, - tokenizer: SentencePieceProcessor, + tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, + use_kv_cache: bool = False, ): - super().__init__() + device = "cuda" if torch.cuda.is_available() else "cpu" + super().__init__(device=device) self._model = model self._tokenizer = tokenizer - self._device = ( - torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - ) + self._device = torch.device(device) self._max_seq_length = 2048 if max_seq_length is None else max_seq_length + self._use_kv_cache = use_kv_cache @property def eot_token_id(self): - return self._tokenizer.eos_id() + return self._tokenizer.eos_id @property def max_length(self): @@ -65,7 +72,7 @@ def device(self): return self._device def tok_encode(self, string: str, **kwargs): - tokens = [self._tokenizer.bos_id()] + self._tokenizer.encode(string) + tokens = self._tokenizer.encode(string, bos=True, eos=False) encoded = torch.tensor(tokens, dtype=torch.int, device=self.device) # encoded is a pytorch tensor, but some internal logic in the # eval harness expects it to be a list instead @@ -78,7 +85,15 @@ def tok_decode(self, tokens): return decoded def _model_call(self, inps): - return self._model(inps) + if self._use_kv_cache: + result_logits = [] + for pos in range(self._max_seq_length): + pos_tensor = torch.tensor([pos], dtype=torch.int64) + logits = self._model(inps[:, pos : pos + 1], pos_tensor) + result_logits.append(logits) + return torch.cat(result_logits, dim=1) + else: + return self._model(inps) def _model_generate(self, context, max_length, eos_token_id): raise Exception("unimplemented") @@ -93,7 +108,7 @@ class ETEagerEvalWrapper(GPTFastEvalWrapper): def __init__( self, model: str, - tokenizer: SentencePieceProcessor, + tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, ): super().__init__(None, tokenizer, max_seq_length) @@ -102,13 +117,22 @@ def __init__( from executorch.extension.pybindings.portable_lib import _load_for_executorch self._et_model = _load_for_executorch(self._model) + self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0] def _model_call(self, inps): # Given inps (tokens), return the logits from a single forward call # inps: Tensor of shape (1, max_seq_len - 1) - # logits: Tensor of shape (1, max_seq_len - 1, 32000) - result = self._et_model.forward((inps,)) - return result[0] + # logits: Tensor of shape (1, max_seq_len - 1, vocab_size) + if self._use_kv_cache: + result_logits = [] + for pos in range(self._max_seq_length): + pos_tensor = torch.tensor([pos], dtype=torch.int64) + logits = self._et_model.forward((inps[:, pos : pos + 
1], pos_tensor)) + result_logits.append(logits[0]) + return torch.cat(result_logits, dim=1) + else: + result = self._et_model.forward((inps,)) + return result[0] class ETRunnerEvalWrapper(GPTFastEvalWrapper): @@ -120,7 +144,7 @@ class ETRunnerEvalWrapper(GPTFastEvalWrapper): def __init__( self, model: str, - tokenizer: SentencePieceProcessor, + tokenizer: Union[SentencePieceTokenizer, Tiktoken], tokenizer_bin: str, max_seq_length: Optional[int] = None, ): @@ -134,7 +158,7 @@ def _model_call(self, inps): # Example: # inps: Tensor of shape (1, N) - # logits: Tensor of shape (1, N, 32000) + # logits: Tensor of shape (1, N, vocab_size) pass @@ -183,7 +207,11 @@ def gen_eval_wrapper( Returns: eval_wrapper (LM): A wrapper interface for the lm-evaluation-harness library. """ - tokenizer = SentencePieceProcessor(model_file=str(args.tokenizer_path)) + try: + tokenizer = SentencePieceTokenizer(model_path=str(args.tokenizer_path)) + except Exception: + print("Using Tiktokenizer") + tokenizer = Tiktoken(model_path=str(args.tokenizer_path)) # ExecuTorch Binary Evaluation if (model := args.pte) is not None: @@ -216,6 +244,7 @@ def gen_eval_wrapper( model=model, tokenizer=tokenizer, max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, ) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index de1e711a2c9..a8c8880b281 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -9,32 +9,39 @@ import argparse import copy import logging -import os import shlex - -from functools import partial from pathlib import Path -from typing import Any, Optional, Union +from typing import Union import pkg_resources -import torch -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackDynamicallyQuantizedPartitioner, -) - -from executorch.examples.models.llama2.llama_transformer import Transformer -from executorch.exir.backend.backend_details import CompileSpec from executorch.sdk.etrecord import generate_etrecord from executorch.util.activation_memory_profiler import generate_memory_trace -from sentencepiece import SentencePieceProcessor from .builder import DType, LlamaEdgeManager, load_llama_model, WeightType -from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers - -from .quantize import EmbeddingOnlyInt8QuantHandler, WeightOnlyInt8QuantHandler +from .lib.partitioner_lib import ( + get_coreml_partitioner, + get_mps_partitioner, + get_qnn_partitioner, + get_vulkan_partitioner, + get_xnnpack_partitioner, +) +from .lib.quant_lib import ( + _get_pt2e_quantization_params, + get_pt2e_quantizers, + get_qnn_quantizer, +) +from .source_transformation.quantize import ( + get_quant_embedding_transform, + get_quant_weight_transform, +) +from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis +from .source_transformation.sdpa import ( + replace_causal_mask, + replace_sdpa_with_custom_op, + replace_sdpa_with_simple_sdpa, +) IS_FBCODE = True # os.environ.get("FBCODE_PLATFORM", False) FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -62,133 +69,6 @@ def verbose_export(): return verbosity_setting -def materialze_broadcast_of_rope_freq_cis( - module: torch.nn.Module, -): - assert isinstance(module, Transformer) - assert module.freqs_cos.dim() == 2 - dim0 = module.freqs_cos.size(0) - dim1 = module.freqs_cos.size(1) - assert ( - 
module.layers[0].attention.n_local_kv_heads - == module.layers[0].attention.n_local_heads - ), f"For rope freqs to be materialzed for broadcast q, k, v num heads must match. For q got {module.attention.n_kv_heads} for k got {module.attention.n_local_heads} and v got {module.attention.n_local_kv_heads}" - num_heads = module.layers[0].attention.n_local_heads - module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) - module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() - assert module.freqs_sin.dim() == 2 - assert dim0 == module.freqs_sin.size( - 0 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.freqs_sin.size(0)}" - assert dim1 == module.freqs_sin.size( - 1 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 1: {dim1} vs {module.freqs_sin.size(1)}" - module.freqs_sin = module.freqs_sin.view(dim0, 1, dim1) - module.freqs_sin = module.freqs_sin.expand(dim0, num_heads, dim1).contiguous() - return module - - -def quantize( - model: torch.nn.Module, - qmode: str, - activation_dtype: Optional[DType], - checkpoint_path: Optional[Path] = None, - # following arguments only available when setting int4 or gptq quantization. - group_size: Optional[int] = 128, - # following arguments are only used for GPTQ - calibration_tasks: Optional[list] = None, - calibration_limit: Optional[int] = None, - calibration_seq_length: Optional[int] = None, - pad_calibration_inputs: bool = False, - percdamp: float = 0.01, - blocksize: int = 128, - tokenizer_path: Optional[Path] = None, -) -> torch.nn.Module: - """ - Quantizes a model by converting all weights to int8. - Args: - model: A model to quantize. - qmode: quantization mode, e.g. int8, 8da4w, 8da4w-gptq - Returns: - A quantized model. - """ - if activation_dtype is not None: - torch_dtype = activation_dtype.to_torch_dtype() - else: - torch_dtype = torch.float16 - - assert checkpoint_path, "Need to specify a checkpoint" - assert os.path.isfile( - canonical_path(checkpoint_path) - ), f"{checkpoint_path} does not exist" - # if checkpoint_path is None: - # checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") - - if qmode == "int8": - # Add quantization mode options here: group size, bit width, etc. - return WeightOnlyInt8QuantHandler(model).quantized_model() - elif qmode == "8da4w": - # Check for required args - if group_size is None: - raise Exception("For 8da4w quantization, group size must be specified.") - from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer - - model = Int8DynActInt4WeightQuantizer( - precision=torch_dtype, groupsize=group_size - ).quantize(model) - if verbose_export(): - print("quantized model:", model) - return model - elif qmode == "8da4w-gptq": - # Check for required args - required_args: Optional[Any] = [ - group_size, - calibration_limit, - calibration_seq_length, - ] - if any(arg is None for arg in required_args): - raise Exception( - "For 8da4w-gptq quantization, group size, calibration limit and calibration sequence length must be specified." 
- ) - if calibration_tasks is None: - calibration_tasks = ["wikitext"] - - from torchao.quantization.GPTQ import InputRecorder - from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer - - if tokenizer_path is None: - tokenizer_path = checkpoint_path.parent / "tokenizer.model" - assert tokenizer_path.is_file(), tokenizer_path - tokenizer = SentencePieceProcessor( # pyre-ignore[28] - model_file=str(tokenizer_path) - ) - - inputs = ( - InputRecorder( - tokenizer, - calibration_seq_length, - None, # input_prep_func - pad_calibration_inputs, - model.vocab_size, - ) - .record_inputs( - calibration_tasks, - calibration_limit, - ) - .get_inputs() - ) - - gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer( - blocksize, - percdamp, - group_size, - ) - model = gptq_quantizer.quantize(model, inputs) - return model - else: - raise Exception(f"Unrecognized quantize mode: {qmode}") - - def build_model( modelname: str = "model", extra_opts: str = "", @@ -225,6 +105,13 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--pt2e_quantize", default=None, + choices=[ + "xnnpack_dynamic", + "xnnpack_dynamic_qc4", + "qnn_8a8w", + "qnn_16a16w", + "qnn_16a4w", + ], help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.", ) parser.add_argument( @@ -242,6 +129,13 @@ def build_args_parser() -> argparse.ArgumentParser: default=f"{ckpt_dir}/params/demo_rand_params.pth", help="checkpoint path", ) + + parser.add_argument( + "--checkpoint_dir", + default=None, + help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.", + ) + parser.add_argument( "--calibration_tasks", nargs="+", @@ -417,10 +311,12 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager: """ # load model from checkpoint and params.json - checkpoint_path = canonical_path(args.checkpoint) + checkpoint_path = canonical_path(args.checkpoint) if args.checkpoint else None + checkpoint_dir = ( + canonical_path(args.checkpoint_dir) if args.checkpoint_dir else None + ) params_path = canonical_path(args.params) output_dir_path = canonical_path(args.output_dir, dir=True) - modelname = "llama2" weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA # dtype override @@ -435,56 +331,28 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager: transforms = [] if args.quantization_mode: modelname = f"{modelname}_q" - - # If these optional args are None, don't provide them to quantize() - quant_args_str = [ - "group_size", - "calibration_tasks", - "calibration_limit", - "calibration_seq_length", - ] - arg_dict = vars(args) - quant_args = { - param: val - for param in quant_args_str - if (val := arg_dict.get(param)) is not None - } - transforms.append( - partial( - quantize, - **quant_args, - qmode=args.quantization_mode, - activation_dtype=dtype_override, - checkpoint_path=( - Path(path) if (path := args.checkpoint) is not None else None - ), - tokenizer_path=( - Path(path) if (path := args.tokenizer_path) is not None else None - ), - ) + get_quant_weight_transform(args, dtype_override, verbose_export()) ) if args.embedding_quantize: modelname = f"{modelname}_e" - bitwidth, group_size = args.embedding_quantize.split(",") - if group_size == "none" or group_size == "None" or group_size == "0": - group_size = None - else: - group_size = int(group_size) - 
bitwidth = int(bitwidth) - transforms.append( - lambda model: EmbeddingOnlyInt8QuantHandler( - model, bitwidth=bitwidth, group_size=group_size - ).quantized_model() - ) + transforms.append(get_quant_embedding_transform(args)) if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + + if args.qnn and args.use_kv_cache: + transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_causal_mask) return ( load_llama_model( + modelname=modelname, checkpoint=checkpoint_path, + checkpoint_dir=checkpoint_dir, params_path=params_path, use_kv_cache=args.use_kv_cache, use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache, @@ -503,161 +371,47 @@ def _export_llama(modelname, args) -> str: # noqa: C901 # export_to_edge pt2e_quant_params = _get_pt2e_quantization_params(args) quantizers = get_pt2e_quantizers(pt2e_quant_params, args) - if args.qnn: - assert ( - args.quantization_mode is None - ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" - try: - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer` - from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer - - # reset quantizers and pt2e_quant_params from xnnpack backend - pt2e_quant_params = None - quantizers = [] - except ImportError: - raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" - ) - - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. - qnn_quantizer = QnnQuantizer() - # more custom quantization are supported including 16a4w etc. default to 8bit quantized - custom_annotations = () - qnn_quantizer.add_custom_quant_annotations(custom_annotations) + quant_dtype = None + if args.qnn and args.pt2e_quantize: + assert quantizers is None, "Should not enable both xnnpack and qnn" + qnn_quantizer, quant_dtype = get_qnn_quantizer(args) quantizers.append(qnn_quantizer) builder_exported_to_edge = _prepare_for_llama_export( modelname, args ).export_to_edge(quantizers) + modelname = builder_exported_to_edge.modelname + # to_backend partitioners = [] if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None: - partitioners.append(XnnpackDynamicallyQuantizedPartitioner()) + partitioners.append(get_xnnpack_partitioner()) modelname = f"xnnpack_dq_{modelname}" if args.xnnpack: - # Following changes due to. - # 1. We need dynamically quantized partitioner for both pt2e_quantize options - # as well as "qmode 8da4w" which is also dynamic quantizes linear layers. - # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops. 
- partitioners.append(XnnpackDynamicallyQuantizedPartitioner()) - # partitioners.append(XnnpackPartitioner()) + partitioners.append(get_xnnpack_partitioner()) modelname = f"xnnpack_{modelname}" if args.vulkan: - assert ( - args.dtype_override == "fp32" or args.dtype_override is None - ), "Vulkan backend does not support non fp32 dtypes at the moment" - assert ( - args.quantization_mode is None - ), "Vulkan backend does not support quantization at the moment" - - partitioners.append(VulkanPartitioner()) + partitioners.append(get_vulkan_partitioner(args)) modelname = f"vulkan_{modelname}" if args.mps: - assert ( - args.use_kv_cache is True - ), "MPS backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" - try: - # pyre-ignore Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.mps.partition.mps_partitioner`. - from executorch.backends.apple.mps.partition.mps_partitioner import ( - MPSPartitioner, - ) - except ImportError: - raise ImportError( - "Please install the MPS backend follwing https://pytorch.org/executorch/main/build-run-mps.html" - ) - - compile_specs = [CompileSpec("use_fp16", bytes([True]))] - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple`. - partitioners.append(MPSPartitioner(compile_specs)) + partitioners.append(get_mps_partitioner(args)) modelname = f"mps_{modelname}" if args.coreml: - assert ( - args.use_kv_cache is True - ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" - try: - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner`. - import coremltools as ct - - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.compiler` - from executorch.backends.apple.coreml.compiler import CoreMLBackend - - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner` - from executorch.backends.apple.coreml.partition.coreml_partitioner import ( - CoreMLPartitioner, - ) - except ImportError: - raise ImportError( - "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" - ) - - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple`. 
- compile_specs = CoreMLBackend.generate_compile_specs( - compute_precision=ct.precision(ct.precision.FLOAT16.value), - compute_unit=ct.ComputeUnit[ct.ComputeUnit.ALL.name.upper()], - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` - model_type=CoreMLBackend.MODEL_TYPE.MODEL, - ) - partitioners.append( - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` - CoreMLPartitioner( - skip_ops_for_coreml_delegation=[ - "aten.index_put.default", - ], - compile_specs=compile_specs, - ) - ) + partitioners.append(get_coreml_partitioner(args)) modelname = f"coreml_{modelname}" if args.qnn: - assert ( - args.use_kv_cache is True - ), "Qualcomm backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" - try: - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.partition.qnn_partitioner` - from executorch.backends.qualcomm.partition.qnn_partitioner import ( - QnnPartitioner, - ) + partitioners.append(get_qnn_partitioner(args, quant_dtype)) + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` + from executorch.backends.qualcomm.utils.utils import _transform - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qnn_compile_spec_schema` - from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, - ) - - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` - from executorch.backends.qualcomm.utils.utils import ( - _transform, - generate_htp_compiler_spec, - generate_qnn_executorch_compiler_spec, - ) - except ImportError: - raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" - ) - - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` - backend_options = generate_htp_compiler_spec(use_fp16=False) - partitioners.append( - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` - QnnPartitioner( - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm` - generate_qnn_executorch_compiler_spec( - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. 
-                    soc_model=QcomChipset.SM8650,  # default to SM8650
-                    backend_options=backend_options,
-                    debug=False,
-                    saver=False,
-                ),
-                skip_node_id_set={},
-                skip_node_op_set={},
-            )
-        )
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        _transform(builder_exported_to_edge.export_program())
+        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program`
+        _transform(builder_exported_to_edge.edge_manager.exported_program())

     if args.generate_etrecord:
         if not builder_exported_to_edge.edge_manager:
diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama2/install_requirements.sh
index f2d9c9ee2d0..d316790d572 100755
--- a/examples/models/llama2/install_requirements.sh
+++ b/examples/models/llama2/install_requirements.sh
@@ -11,7 +11,8 @@ pip install snakeviz sentencepiece
 pip install torchao==0.1

 # Install lm-eval for Model Evaluation with lm-evalution-harness
-pip install lm-eval
+# Install tiktoken for tokenizer
+pip install lm-eval tiktoken blobfile

 # Call the install helper for further setup
 python examples/models/llama2/install_requirement_helper.py
diff --git a/.swift/coreml_backend_debug/dummy.swift b/examples/models/llama2/lib/__init__.py
similarity index 100%
rename from .swift/coreml_backend_debug/dummy.swift
rename to examples/models/llama2/lib/__init__.py
diff --git a/examples/models/llama2/lib/partitioner_lib.py b/examples/models/llama2/lib/partitioner_lib.py
new file mode 100644
index 00000000000..1638a357576
--- /dev/null
+++ b/examples/models/llama2/lib/partitioner_lib.py
@@ -0,0 +1,138 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def get_xnnpack_partitioner():
+    from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
+        XnnpackDynamicallyQuantizedPartitioner,
+    )
+
+    # We return the dynamically quantized partitioner because:
+    # 1. It is needed for both pt2e_quantize options as well as "qmode 8da4w",
+    #    which also dynamically quantizes linear layers.
+    # 2. The regular XNNPACK partitioner seems to result in a seg fault for
+    #    non-dqlinear ops.
+    return XnnpackDynamicallyQuantizedPartitioner()
+
+
+def get_vulkan_partitioner(args):
+    assert (
+        args.dtype_override == "fp32" or args.dtype_override is None
+    ), "Vulkan backend does not support non fp32 dtypes at the moment"
+    assert (
+        args.quantization_mode is None
+    ), "Vulkan backend does not support quantization at the moment"
+    # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.vulkan.partition.vulkan_partitioner`
+    from executorch.backends.vulkan.partition.vulkan_partitioner import (
+        VulkanPartitioner,
+    )
+
+    return VulkanPartitioner()
+
+
+def get_mps_partitioner(args):
+    from executorch.exir.backend.backend_details import CompileSpec
+
+    assert (
+        args.use_kv_cache is True
+    ), "MPS backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment"
+    try:
+        # pyre-ignore Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.mps.partition.mps_partitioner`.
+        from executorch.backends.apple.mps.partition.mps_partitioner import (
+            MPSPartitioner,
+        )
+    except ImportError:
+        raise ImportError(
+            "Please install the MPS backend following https://pytorch.org/executorch/main/build-run-mps.html"
+        )
+
+    compile_specs = [CompileSpec("use_fp16", bytes([True]))]
+    return MPSPartitioner(compile_specs)
+
+
+def get_coreml_partitioner(args):
+    assert (
+        args.use_kv_cache is True
+    ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment"
+    try:
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner`.
+        import coremltools as ct
+
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.compiler`
+        from executorch.backends.apple.coreml.compiler import CoreMLBackend
+
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner`
+        from executorch.backends.apple.coreml.partition.coreml_partitioner import (
+            CoreMLPartitioner,
+        )
+    except ImportError:
+        raise ImportError(
+            "Please install the CoreML backend following https://pytorch.org/executorch/main/build-run-coreml.html"
+        )
+
+    compile_specs = CoreMLBackend.generate_compile_specs(
+        compute_precision=ct.precision(ct.precision.FLOAT16.value),
+        # Using `ComputeUnit.ALL` can increase the model load time, so default to `ComputeUnit.CPU_AND_GPU`.
+        compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
+        model_type=CoreMLBackend.MODEL_TYPE.MODEL,
+    )
+    return CoreMLPartitioner(
+        compile_specs=compile_specs,
+    )
+
+
+def get_qnn_partitioner(args, quant_dtype):
+    assert (
+        args.use_kv_cache is True
+    ), "Qualcomm backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment"
+    try:
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.partition.qnn_partitioner`
+        from executorch.backends.qualcomm.partition.qnn_partitioner import (
+            QnnPartitioner,
+        )
+
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer`
+        from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qnn_compile_spec_schema`
+        from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
+            QcomChipset,
+        )
+
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils`
+        from executorch.backends.qualcomm.utils.utils import (
+            generate_htp_compiler_spec,
+            generate_qnn_executorch_compiler_spec,
+        )
+    except ImportError:
+        raise ImportError(
+            "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html"
+        )
+
+    use_fp16 = True
+    skip_node_op_set = {}
+    if args.pt2e_quantize:
+        use_fp16 = False
+        # TODO: fix the lowering error without skipping nodes
+
+        if quant_dtype == QuantDtype.use_8a8w:
+            raise NotImplementedError("8a8w for llama is still under development")
+
+        elif quant_dtype == QuantDtype.use_16a16w:
+            raise NotImplementedError("16a16w for llama is still under development")
+
+        elif quant_dtype ==
QuantDtype.use_16a4w: + raise NotImplementedError("16a4w for llama is still under development") + + return QnnPartitioner( + generate_qnn_executorch_compiler_spec( + soc_model=QcomChipset.SM8650, # default to SM8650 + backend_options=generate_htp_compiler_spec(use_fp16=use_fp16), + debug=False, + saver=False, + ), + skip_node_id_set={}, + skip_node_op_set=skip_node_op_set, + ) diff --git a/examples/models/llama2/quant_lib.py b/examples/models/llama2/lib/quant_lib.py similarity index 71% rename from examples/models/llama2/quant_lib.py rename to examples/models/llama2/lib/quant_lib.py index 226f10421b9..e1f0827a644 100644 --- a/examples/models/llama2/quant_lib.py +++ b/examples/models/llama2/lib/quant_lib.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# This is for PT2E quantization. For source-transformation quantize, please modify source_transformation/quantize.py + import logging from dataclasses import dataclass from typing import List, Optional @@ -105,7 +107,7 @@ def check_embedding_byte_registered(): 'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n' "Set that as TORCH_PACKAGE_DIR.\n" "Then from root executorch dir do the following:\n" - "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n" + "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n" 'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n' "Then specify the said library via -s int: + if n % k == 0: + return n + return n + k - (n % k) + + @dataclass class ModelArgs: dim: int = 4096 @@ -82,7 +88,10 @@ class ModelArgs: use_sdpa_with_kv_cache_op: bool = ( False # Use custom sdpa op that updates kv cache in-place ) - rope_freq_base: float = 10000.0 # The base frequency for RoPE + rope_theta: Optional[float] = ( + None # The official name to override self.rope_freq_base. + ) + rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC. # Additional Model Metadata needed at runtime bos_idx: int = 1 eos_idx: int = 3 @@ -93,9 +102,23 @@ def __post_init__(self): if self.n_kv_heads is None: self.n_kv_heads = self.n_heads + # rope_theta overrides rope_freq_base since it's the official name. 
+ if self.rope_theta is not None: + self.rope_freq_base = self.rope_theta + if self.use_sdpa_with_kv_cache_op: assert self.use_kv_cache, "use_sdpa_with_kv_cache_op requires use_kv_cache" + if self.hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = self.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + if self.ffn_dim_multiplier is not None: + hidden_dim = int(self.ffn_dim_multiplier * hidden_dim) + self.hidden_dim = find_multiple(hidden_dim, multiple_of) + def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" @@ -193,6 +216,44 @@ def update( return k_out, v_out +class SDPA(nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask: torch.Tensor, + ) -> torch.Tensor: + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0) + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + class Attention(nn.Module): def __init__(self, args: ModelArgs, layer_id: int): super().__init__() @@ -213,7 +274,6 @@ def __init__(self, args: ModelArgs, layer_id: int): self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) - self.use_sdpa_with_kv_cache_op = args.use_sdpa_with_kv_cache_op self.layer_id = layer_id causal_mask = torch.tril( @@ -234,6 +294,12 @@ def __init__(self, args: ModelArgs, layer_id: int): self.head_dim, not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. 
Expect untransposed q k v ) + self.SDPA = SDPA( + kv_cache=self.kv_cache, + dim=self.dim, + head_dim=self.head_dim, + n_rep=self.n_rep, + ) def forward( self, @@ -256,41 +322,8 @@ def forward( if self.use_kv_cache: assert input_pos is not None - - if not self.use_sdpa_with_kv_cache_op: - - q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - k, v = self.kv_cache.update(input_pos, k, v) - mask = self.mask[None, None, input_pos] - - k = k.repeat_interleave(self.n_rep, dim=1) - v = v.repeat_interleave(self.n_rep, dim=1) - y = F.scaled_dot_product_attention( - q, k, v, attn_mask=mask, dropout_p=0.0 - ) - - y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) - - y = self.wo(y) - return y - else: - from .custom_ops.sdpa_with_kv_cache import sdpa_with_kv_cache # noqa - - output = torch.ops.llama.sdpa_with_kv_cache( - q, - k, - v, - self.kv_cache.k_cache, - self.kv_cache.v_cache, - input_pos[-1].item(), - seqlen, - ) - output = output.view(bsz, seqlen, -1) - output = self.wo(output) - return output + output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask) + return self.wo(output) q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) k = k.transpose(1, 2) @@ -316,19 +349,11 @@ def forward( class FeedForward(nn.Module): def __init__(self, args: ModelArgs): super().__init__() - dim = args.dim - hidden_dim = args.hidden_dim - if hidden_dim is None: - # If hidden_dim is not explicitly set in the ModelArgs, - # then calculate implicitly based on dim and also multiple of `args.multiple_of` - multiple_of = args.multiple_of - hidden_dim = 4 * dim - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = nn.Linear(dim, hidden_dim, bias=False) - self.w2 = nn.Linear(hidden_dim, dim, bias=False) - self.w3 = nn.Linear(dim, hidden_dim, bias=False) + assert args.hidden_dim is not None + hidden_dim: int = args.hidden_dim + self.w1 = nn.Linear(args.dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, args.dim, bias=False) + self.w3 = nn.Linear(args.dim, hidden_dim, bias=False) def forward(self, x): return self.w2(F.silu(self.w1(x)) * self.w3(x)) @@ -425,7 +450,11 @@ def __init__(self, params: ModelArgs): freqs_cos, freqs_sin = precompute_freqs_cis( params.dim // params.n_heads, - params.max_seq_len, + ( + params.max_seq_len # Normal llama2. + if params.ffn_dim_multiplier is None + else params.max_seq_len * 2 # Sharded checkpoint. + ), params.rope_freq_base, ) self.register_buffer("freqs_cos", freqs_cos, persistent=False) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 68882433679..aa997aa56ea 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -4,7 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import json +import os from pathlib import Path import torch @@ -48,6 +50,12 @@ def __init__(self, **kwargs): # The 1st way ckpt_dir = Path(__file__).absolute().parent / "params" + # Check if checkpoint_dir was provided for a sharded checkpoint. + checkpoint_dir = ( + kwargs["checkpoint_dir"] if "checkpoint_dir" in kwargs else None + ) + + # Use single checkpoint file. 
checkpoint_path = ( kwargs["checkpoint"] if "checkpoint" in kwargs @@ -72,7 +80,35 @@ def __init__(self, **kwargs): # Follow the instruction in https://github.com/facebookresearch/llama to download the model device = "cpu" # flake8: noqa: TOR102 - checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) + cps = [] + if checkpoint_dir is not None: + # Load multiple checkpoint; ignore the single path. + checkpoint_path = None + for i in range(4): + cp_name = f"consolidated.{i}.pth" + print(f"Loading {cp_name}") + cps.append( + torch.load( + os.path.join(checkpoint_dir, cp_name), + map_location=device, + mmap=True, + ) + ) + checkpoint = {} + for key in cps[0].keys(): + if not torch.allclose(cps[0][key], cps[1][key]): + values = (cps[0][key], cps[1][key], cps[2][key], cps[3][key]) + if "wo" in key or "w2" in key: + # Concat on dim=1 for "wo" and "w2". + checkpoint[key] = torch.cat(values, dim=1) + else: + # Concat on dim=0 for everything else. + checkpoint[key] = torch.cat(values, dim=0) + else: + # Do not duplicate layers shared between each checkpoint. + checkpoint[key] = cps[0][key] + else: + checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) fairseq2_checkpoint = kwargs.get("fairseq2", False) if fairseq2_checkpoint: print("Using fairseq2 checkpoint") @@ -140,7 +176,8 @@ def __init__(self, **kwargs): if "int8" in str(checkpoint_path): print("Using int8 weight-only quantization!") - from .quantize import WeightOnlyInt8QuantHandler + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.examples.models.source_transformation.quantize` + from ..source_transformation.quantize import WeightOnlyInt8QuantHandler simple_quantizer = WeightOnlyInt8QuantHandler(self.model_) self.model_ = simple_quantizer.convert_for_runtime() @@ -173,11 +210,7 @@ def get_eager_model(self): def get_example_inputs(self): if self.use_kv_cache: - if self.use_sdpa_with_kv_cache_op: - return self.get_example_inputs_kvcache_sdpa() - else: - # return self.get_example_inputs_kvcache() TODO xnnpack does not handle forwarding symints, update partitioner to not partition symints - return self.get_example_inputs_kvcache_sdpa() + return self.get_example_inputs_kvcache_sdpa() else: return ( torch.tensor( @@ -195,13 +228,3 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on.) ) - - def get_example_inputs_kvcache(self): - return ( - torch.tensor( - [[1, 2, 3]], dtype=torch.long - ), # tokens, with kv cache our input token length is always just 1 token. - torch.tensor( - [0, 1, 2], dtype=torch.long - ), # start_pos, what token of output are we on. - ) diff --git a/examples/models/llama2/ops/TARGETS b/examples/models/llama2/ops/TARGETS deleted file mode 100644 index 0fbbff56977..00000000000 --- a/examples/models/llama2/ops/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("ai_infra_mobile_platform") - -define_common_targets() diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml deleted file mode 100644 index 8e435169e17..00000000000 --- a/examples/models/llama2/ops/quantized.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!) 
- variants: function - kernels: - - arg_meta: null - kernel_name: torch::executor::quantized_embedding_byte_out - -- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: torch::executor::quantized_embedding_byte_dtype_out diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py deleted file mode 100644 index 5d13856442d..00000000000 --- a/examples/models/llama2/ops/quantized_ops.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import torch -from torch.library import impl, impl_abstract - -# NOTE: this is a hacky way to get around the fact that we can't use quantized_decomposed::embedding_byte in exir directly in eager model. That op can be found under exir/passes/_quant_patterns_and_replacements.py. Ideally we should consolidate these 2 versions. -# This op share the same signature and C++ kernel implementation with quantized_decomposed::embedding_byte. -quantized_lib = torch.library.Library( - "llama_quantized", "DEF" -) # to not be confused with torch.ops.quantized.* ops. -quantized_lib.define( - "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor", -) - -quantized_lib.define( - "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)", -) - -quantized_lib.define( - "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", -) - -quantized_lib.define( - "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) 
out) -> Tensor(a!)", -) - - -def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points): - assert weight.dtype in [ - torch.int8, - torch.uint8, - ], f"Expecting weights to be of dtype in [torch.int8, torch.uint8], but got {weight.dtype}" - assert ( - weight.dim() == 2 - ), f"Expecting weight tensor to have dim()==2, but found {weight.dim()}" - - assert weight_scales.dtype in [ - torch.float16, - torch.float32, - ], f"Expecting weight_scales to be of dtype in [torch.float16, torch.float32], but got {weight_scales.dtype}" - assert ( - weight_scales.dim() == 1 or weight_scales.dim() == 2 - ), f"Expecting weight_scales tensor to have rank 1 or 2, but found {weight_scales.dim()}" - assert weight_scales.size(0) == weight.size( - 0 - ), f"Expecting weight and scale tensor to have same number of rows, but found {weight.size()} and {weight_scales.size()}" - - assert ( - weight_zero_points is None or weight_zero_points.dtype == weight_scales.dtype - ), "Expecting weight_zero_points to be None or have same dtype as weight_scales" - assert ( - weight_zero_points is None or weight_zero_points.dim() == 1 - ), f"Expecting weight_zero_points tensor to be None or have dim()==1, but found {weight_zero_points.dim()}" - assert weight_zero_points is None or weight_zero_points.size(0) == weight.size( - 0 - ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}" - - -@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd") -def embedding_byte( - weight: torch.Tensor, - weight_scales: torch.Tensor, - weight_zero_points: Optional[torch.Tensor], - weight_quant_min: int, - weight_quant_max: int, - indices: torch.Tensor, -) -> torch.Tensor: - embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - group_size = weight.size(1) // ( - weight_scales.size(1) if weight_scales.dim() == 2 else 1 - ) - weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - weight.dtype, - group_size, - weight_scales.dtype, - ) - return torch.ops.aten.embedding.default(weight, indices) - - -@impl_abstract("llama_quantized::embedding_byte.out") -def embedding_byte_out_meta( - weight: torch.Tensor, - weight_scales: torch.Tensor, - weight_zero_points: Optional[torch.Tensor], - weight_quant_min: int, - weight_quant_max: int, - indices: torch.Tensor, - out: torch.Tensor, -) -> torch.Tensor: - return embedding_byte( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, - ) - - -@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd") -def embedding_byte_dtype( - weight: torch.Tensor, - weight_scales: torch.Tensor, - weight_zero_points: Optional[torch.Tensor], - weight_quant_min: int, - weight_quant_max: int, - indices: torch.Tensor, - *, - dtype: Optional[torch.dtype] = None, -) -> torch.Tensor: - embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - group_size = weight.size(1) // ( - weight_scales.size(1) if weight_scales.dim() == 2 else 1 - ) - weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - weight.dtype, - group_size, - dtype, - ) - return torch.ops.aten.embedding.default(weight, indices) - - -@impl_abstract("llama_quantized::embedding_byte.dtype_out") -def embedding_byte_dtype_out_meta( - 
weight: torch.Tensor, - weight_scales: torch.Tensor, - weight_zero_points: Optional[torch.Tensor], - weight_quant_min: int, - weight_quant_max: int, - indices: torch.Tensor, - *, - dtype: Optional[torch.dtype] = None, - out: torch.Tensor, -) -> torch.Tensor: - return embedding_byte_dtype( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, - dtype=dtype, - ) diff --git a/examples/models/llama2/ops/targets.bzl b/examples/models/llama2/ops/targets.bzl deleted file mode 100644 index b441773dc3b..00000000000 --- a/examples/models/llama2/ops/targets.bzl +++ /dev/null @@ -1,50 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.python_library( - name = "quantized_aot_lib", - srcs = [ - "quantized_ops.py", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//caffe2:torch", - ], - ) - - runtime.export_file( - name = "quantized.yaml", - visibility = [ - "@EXECUTORCH_CLIENTS", - ], - ) - - et_operator_library( - name = "all_quantized_ops", - define_static_targets = True, - ops_schema_yaml_target = ":quantized.yaml", - ) - - executorch_generated_lib( - name = "generated_lib", - custom_ops_yaml_target = ":quantized.yaml", - define_static_targets = True, - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":all_quantized_ops", - "//executorch/kernels/quantized:quantized_operators", - ], - ) diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index 8e9190eb4c1..9e9ad0d4879 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -39,20 +39,26 @@ list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") target_include_directories(extension_module INTERFACE ${_common_include_directories}) -if(CMAKE_TOOLCHAIN_IOS OR ANDROID OR APPLE) - # Building a share library on iOS requires code signing - # On Android we see duplicated registration when using shared lib +if(EXECUTORCH_USE_TIKTOKEN) + list(APPEND _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp) + set(_preprocessor_flag -DET_USE_TIKTOKEN) +endif() + +if(CMAKE_TOOLCHAIN_IOS + OR ANDROID + OR APPLE) + # Building a share library on iOS requires code signing On Android we see + # duplicated registration when using shared lib add_library(llama_runner STATIC ${_llama_runner__srcs}) else() add_library(llama_runner SHARED ${_llama_runner__srcs}) endif() -set(llama_runner_deps executorch extension_module extension_data_loader - custom_ops) +set(llama_runner_deps executorch extension_module extension_data_loader) -target_link_libraries( - llama_runner PUBLIC ${llama_runner_deps}) +target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) -target_include_directories(llama_runner - INTERFACE ${_common_include_directories} - ${EXECUTORCH_ROOT}) +target_include_directories(llama_runner INTERFACE ${_common_include_directories} + ${EXECUTORCH_ROOT}) +target_compile_options(llama_runner PUBLIC ${_preprocessor_flag}) diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama2/runner/generation.py new file mode 
100644 index 00000000000..56a15005ef1 --- /dev/null +++ b/examples/models/llama2/runner/generation.py @@ -0,0 +1,370 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import argparse + +import json +from typing import List, Optional, Tuple, TypedDict + +import torch +import torch.nn.functional as F +from executorch.examples.models.llama2.llama_transformer import ModelArgs + +from executorch.examples.models.llama2.tokenizer.tiktoken import ( + Dialog, + Message, + Tokenizer, +) +from executorch.extension.pybindings.portable_lib import _load_for_executorch + + +class CompletionPrediction(TypedDict, total=False): + generation: str + tokens: List[str] # not required + logprobs: List[float] # not required + + +class ChatPrediction(TypedDict, total=False): + generation: Message + tokens: List[str] # not required + logprobs: List[float] # not required + + +def sample_top_p(probs, p): + """ + Perform top-p (nucleus) sampling on a probability distribution. + + Args: + probs (torch.Tensor): Probability distribution tensor. + p (float): Probability threshold for top-p sampling. + + Returns: + torch.Tensor: Sampled token indices. + + Note: + Top-p sampling selects the smallest set of tokens whose cumulative probability mass + exceeds the threshold p. The distribution is renormalized based on the selected tokens. + """ + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token + + +class LlamaRunner: + def __init__(self, model_path: str, tokenizer_path: str, model_args: ModelArgs): + # model is a pte file. + self.model = _load_for_executorch(model_path) + self.params = model_args + self.tokenizer = Tokenizer(tokenizer_path) + assert model_args.vocab_size == self.tokenizer.n_words + + def generate( # noqa: C901 + self, + prompt_tokens: List[List[int]], + max_gen_len: int, + temperature: float = 0.8, + top_p: float = 0.9, + logprobs: bool = False, + echo: bool = False, + ) -> Tuple[List[List[int]], Optional[List[List[float]]]]: + bsz = len(prompt_tokens) + params = self.params + assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) + + min_prompt_len = min(len(t) for t in prompt_tokens) + max_prompt_len = max(len(t) for t in prompt_tokens) + + assert max_prompt_len <= params.max_seq_len + total_len = min(params.max_seq_len, max_gen_len + max_prompt_len) + pad_id = self.tokenizer.pad_id + tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cpu") + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cpu") + if logprobs: + token_logprobs = torch.zeros_like(tokens, dtype=torch.float) + + prev_pos = 0 + if self.params.use_kv_cache: + min_prompt_len = 1 + + eos_reached = torch.tensor([False] * bsz, device="cpu") + input_text_mask = tokens != pad_id + pos = torch.tensor([prev_pos], dtype=torch.int64) + if min_prompt_len == total_len: + if self.params.use_kv_cache: + inputs = (tokens, pos) + else: + inputs = (tokens,) + logits = self.model.forward(inputs) # updated forward call. 
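+            # forward() on the loaded .pte module returns a list of outputs; the first entry holds the logits.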
+ logits = logits[0] + token_logprobs = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens, + reduction="none", + ignore_index=pad_id, + ) + + stop_tokens = torch.tensor(list(self.tokenizer.stop_tokens)) + + for cur_pos in range(min_prompt_len, total_len): + pos = torch.tensor([prev_pos], dtype=torch.int64) + if self.params.use_kv_cache: + inputs = (tokens[:, prev_pos:cur_pos], pos) + else: + inputs = (tokens[:, :cur_pos],) + logits = self.model.forward(inputs) # updated forward call. + logits = logits[0] + if temperature > 0: + probs = torch.softmax(logits[:, -1] / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits[:, -1], dim=-1) + + next_token = next_token.reshape(-1) + + # only replace token if prompt has already been generated + if not self.params.use_kv_cache or cur_pos < len(prompt_tokens[0]): + next_token = torch.where( + input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token + ) + + tokens[:, cur_pos] = next_token + if logprobs: + token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens[:, prev_pos + 1 : cur_pos + 1], + reduction="none", + ignore_index=pad_id, + ) + eos_reached |= (~input_text_mask[:, cur_pos]) & ( + torch.isin(next_token, stop_tokens) + ) + prev_pos = cur_pos + if all(eos_reached): + break + + if logprobs: + token_logprobs = token_logprobs.tolist() + out_tokens, out_logprobs = [], [] + for i, toks in enumerate(tokens.tolist()): + # cut to max gen len + start = 0 if echo else len(prompt_tokens[i]) + toks = toks[start : len(prompt_tokens[i]) + max_gen_len] + probs = None + if logprobs: + probs = token_logprobs[i][start : len(prompt_tokens[i]) + max_gen_len] + # cut to after eos tok if any + for stop_token in self.tokenizer.stop_tokens: + try: + eos_idx = toks.index(stop_token) + toks = toks[:eos_idx] + probs = probs[:eos_idx] if logprobs else None + except ValueError: + pass + out_tokens.append(toks) + out_logprobs.append(probs) + return (out_tokens, out_logprobs if logprobs else None) + + def text_completion( + self, + prompts: List[str], + temperature: float = 0.6, + top_p: float = 0.9, + max_gen_len: Optional[int] = None, + logprobs: bool = False, + echo: bool = False, + ) -> List[CompletionPrediction]: + """ + Perform text completion for a list of prompts using the language generation model. + + Args: + prompts (List[str]): List of text prompts for completion. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence. + If not provided, it's set to the model's maximum sequence length minus 1. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False. + + Returns: + List[CompletionPrediction]: List of completion predictions, each containing the generated text completion. + + Note: + This method generates text completions for the provided prompts, employing nucleus sampling to introduce controlled randomness. + If logprobs is True, token log probabilities are computed for each generated token. 
+ """ + if max_gen_len is None: + max_gen_len = self.model.params.max_seq_len - 1 + prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] + generation_tokens, generation_logprobs = self.generate( + prompt_tokens=prompt_tokens, + max_gen_len=max_gen_len, + temperature=temperature, + top_p=top_p, + logprobs=logprobs, + echo=echo, + ) + + if logprobs: + return [ + { + "generation": self.tokenizer.decode(t), + "tokens": [self.tokenizer.decode([x]) for x in t], + "logprobs": logprobs_i, + } + for t, logprobs_i in zip(generation_tokens, generation_logprobs) + ] + return [{"generation": self.tokenizer.decode(t)} for t in generation_tokens] + + def chat_completion( + self, + dialogs: List[Dialog], + temperature: float = 0.6, + top_p: float = 0.9, + max_gen_len: Optional[int] = None, + logprobs: bool = False, + ) -> List[ChatPrediction]: + """ + Generate assistant responses for a list of conversational dialogs using the language generation model. + + Args: + dialogs (List[Dialog]): List of conversational dialogs, where each dialog is a list of messages. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + max_gen_len (Optional[int], optional): Maximum length of the generated response sequence. + If not provided, it's set to the model's maximum sequence length minus 1. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + + Returns: + List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response. + + Raises: + AssertionError: If the last message in a dialog is not from the user. + AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order. + + Note: + This method generates assistant responses for the provided conversational dialogs. + It employs nucleus sampling to introduce controlled randomness in text generation. + If logprobs is True, token log probabilities are computed for each generated token. 
+ """ + if max_gen_len is None: + max_gen_len = self.model.params.max_seq_len - 1 + + prompt_tokens = [ + self.formatter.encode_dialog_prompt(dialog) for dialog in dialogs + ] + generation_tokens, generation_logprobs = self.generate( + prompt_tokens=prompt_tokens, + max_gen_len=max_gen_len, + temperature=temperature, + top_p=top_p, + logprobs=logprobs, + ) + if logprobs: + return [ + { + "generation": { + "role": "assistant", + "content": self.tokenizer.decode(t), + }, + "tokens": [self.tokenizer.decode([x]) for x in t], + "logprobs": logprobs_i, + } + for t, logprobs_i in zip(generation_tokens, generation_logprobs) + ] + return [ + { + "generation": { + "role": "assistant", + "content": self.tokenizer.decode(t), + }, + } + for t in generation_tokens + ] + + +def build_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-f", + "--pte", + type=str, + default=None, + help="path to exported executorch .pte file", + ) + + parser.add_argument( + "-p", "--params", type=str, default=None, help="model params file" + ) + + parser.add_argument( + "-t", + "--tokenizer", + type=str, + default=None, + ) + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0.6, + ) + + parser.add_argument( + "-kv", + "--kv_cache", + default=False, + action="store_true", + ) + + parser.add_argument( + "--max_gen_len", + type=int, + default=10, + help="Maximum length of the generated response sequence.", + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + + with open(args.params, "r") as f: + params = json.loads(f.read()) + model_args: ModelArgs = ModelArgs( + max_seq_len=128, + max_batch_size=1, + use_kv_cache=args.kv_cache, + **params, + ) + runner = LlamaRunner( + model_path=args.pte, tokenizer_path=args.tokenizer, model_args=model_args + ) + result = runner.text_completion( + prompts=[args.prompt], + max_gen_len=args.max_gen_len, + temperature=args.temperature, + ) + print(f"Result: {result}") + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index af7c25ec67d..46a20a6c050 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -10,6 +10,11 @@ // The module takes in a string as input and emits a string as output. 
#include +#if defined(ET_USE_TIKTOKEN) +#include +#else /* BPE */ +#include +#endif /* ET_USE_TIKTOKEN*/ #include #include @@ -76,19 +81,25 @@ Error Runner::load() { append_eos_ = getMetadataHelper("append_eos_to_prompt", false); // Load tokenizer - tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); +#if defined(ET_USE_TIKTOKEN) + tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); +#else + tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); +#endif tokenizer_->load(tokenizer_path_); if (tokenizer_->bos_tok() != bos_id_) { ET_LOG( Error, - "Tokenizer's BOS id %d does not match model's BOS id %d, will override tokenizer's BOS.", + "Tokenizer's BOS id %" PRIu64 + " does not match model's BOS id %d, will override tokenizer's BOS.", tokenizer_->bos_tok(), bos_id_); } if (tokenizer_->eos_tok() != eos_id_) { ET_LOG( Error, - "Tokenizer's EOS id %d does not match model's EOS id %d, will override tokenizer's EOS.", + "Tokenizer's EOS id %" PRIu64 + " does not match model's EOS id %d, will override tokenizer's EOS.", tokenizer_->eos_tok(), eos_id_); } @@ -103,7 +114,7 @@ Error Runner::load() { } template -T Runner::getMetadataHelper(std::string method_name, T default_val) { +T Runner::getMetadataHelper(const std::string& method_name, T default_val) { T res = default_val; if (model_methods_.count(method_name)) { Result> outputs = module_->execute(method_name); @@ -227,20 +238,18 @@ Error Runner::generate( stats_.inference_start_ms = util::time_in_ms(); shouldStop_ = false; - // encode the (string) prompt into tokens sequence - int num_prompt_tokens = 0; - // max # of prompt tokens: len(prompt) + '\0', ?BOS, ?EOS - int* prompt_tokens = new int[prompt.size() + 1 + n_bos_ + n_eos_]; - // Set the sequence length to the max seq length if not provided seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; - tokenizer_->encode( - prompt.c_str(), - n_bos_, - append_eos_ ? n_eos_ : 0, - prompt_tokens, - &num_prompt_tokens); + Result> encode_res = + tokenizer_->encode(prompt, n_bos_, append_eos_ ? n_eos_ : 0); + + ET_CHECK_OK_OR_RETURN_ERROR( + encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens = encode_res.get(); + int num_prompt_tokens = prompt_tokens.size(); ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( @@ -303,13 +312,13 @@ Error Runner::generate( // Print the prompt for consistent output between single token prefill and // batch prefill. 
- int prev = prompt_tokens[0]; - int cur; + uint64_t prev = prompt_tokens[0]; + uint64_t cur; for (int i = 1; i < num_prompt_tokens; i++) { cur = prompt_tokens[i]; auto piece_res = tokenizer_->decode(prev, cur); ET_CHECK_OK_OR_RETURN_ERROR(piece_res.error()); - util::safe_printf(piece_res.get()); + util::safe_printf(piece_res.get().c_str()); fflush(stdout); prev = cur; } @@ -361,7 +370,7 @@ Error Runner::generate( // print the token as string, decode it with the Tokenizer object auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); - const char* piece = piece_res.get(); + const char* piece = piece_res.get().c_str(); // same as printf("%s", piece), but skips "unsafe" bytes util::safe_printf(piece); @@ -396,7 +405,6 @@ Error Runner::generate( stats_callback(stats_); } - delete[] prompt_tokens; return Error::Ok; } @@ -472,8 +480,7 @@ std::string statsToJsonString(const Runner::Stats& stats) { << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," << "\"first_token_ms\":" << stats.first_token_ms << "," << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms - << "," - << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" + << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; return ss.str(); } @@ -485,9 +492,9 @@ void Runner::stop() { // explicit instantiation of template methods template int64_t Runner::getMetadataHelper( - std::string method_name, + const std::string& method_name, int64_t default_val); template bool Runner::getMetadataHelper( - std::string method_name, + const std::string& method_name, bool default_val); } // namespace torch::executor diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 08f5e33c47e..4e200d5e6ca 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -69,7 +69,7 @@ class Runner { private: // metadata template - T getMetadataHelper(std::string method_name, T default_val); + T getMetadataHelper(const std::string& method_name, T default_val); template int32_t logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _); diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 7f91b4a67fc..d9e51561ca1 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -2,11 +2,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def _get_operator_lib(aten = False): if aten: - return ["//executorch/kernels/aten:generated_lib_aten"] + return ["//executorch/kernels/aten:generated_lib"] elif runtime.is_oss: return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"] else: - return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"] + return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops"] + +def use_tiktoken(): + return native.read_config("llama", "use_tiktoken", "0") == "1" def define_common_targets(): for aten in (True, False): @@ -24,20 +27,24 @@ def define_common_targets(): preprocessor_flags = [ "-DUSE_ATEN_LIB", ] if aten else [], + exported_preprocessor_flags = ["-DET_USE_TIKTOKEN"] if use_tiktoken() else [], visibility = [ "@EXECUTORCH_CLIENTS", ], exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", 
"//executorch/examples/models/llama2/sampler:sampler" + aten_suffix, - "//executorch/examples/models/llama2/tokenizer:tokenizer", "//executorch/extension/evalue_util:print_evalue" + aten_suffix, "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ] + (_get_operator_lib(aten)) + ([ + ] + ([ + "//executorch/examples/models/llama2/tokenizer:tiktoken", + ] if use_tiktoken() else [ + "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer", + ]) + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE) # Therefore enable it explicitly for now to avoid failing tests "//executorch/backends/vulkan:vulkan_backend_lib", diff --git a/.swift/custom_backend/dummy.swift b/examples/models/llama2/source_transformation/__init__.py similarity index 100% rename from .swift/custom_backend/dummy.swift rename to examples/models/llama2/source_transformation/__init__.py diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/source_transformation/quantize.py similarity index 56% rename from examples/models/llama2/quantize.py rename to examples/models/llama2/source_transformation/quantize.py index bedbb86be67..857ccdb281a 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/source_transformation/quantize.py @@ -4,13 +4,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Dict, Optional +from functools import partial +from pathlib import Path +from typing import Any, Dict, Optional import torch import torch.nn as nn import torch.nn.functional as F -from .ops.quantized_ops import * # noqa +from sentencepiece import SentencePieceProcessor + +from ..builder import DType try: # pyre-ignore[21]: Undefined import. @@ -28,6 +32,105 @@ fsLinear = nn.Linear +def quantize( + model: torch.nn.Module, + qmode: str, + activation_dtype: Optional[DType], + checkpoint_path: Optional[Path] = None, + # following arguments only available when setting int4 or gptq quantization. + group_size: Optional[int] = 128, + # following arguments are only used for GPTQ + calibration_tasks: Optional[list] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + pad_calibration_inputs: bool = False, + percdamp: float = 0.01, + blocksize: int = 128, + tokenizer_path: Optional[Path] = None, + verbose: bool = False, +) -> torch.nn.Module: + """ + Quantizes a model by converting all weights to int8. + Args: + model: A model to quantize. + qmode: quantization mode, e.g. int8, 8da4w, 8da4w-gptq + Returns: + A quantized model. + """ + if activation_dtype is not None: + torch_dtype = activation_dtype.to_torch_dtype() + else: + torch_dtype = torch.float16 + + assert checkpoint_path, "Need to specify a checkpoint" + # if checkpoint_path is None: + # checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") + + if qmode == "int8": + # Add quantization mode options here: group size, bit width, etc. 
+ return WeightOnlyInt8QuantHandler(model).quantized_model() + elif qmode == "8da4w": + # Check for required args + if group_size is None: + raise Exception("For 8da4w quantization, group size must be specified.") + from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer + + model = Int8DynActInt4WeightQuantizer( + precision=torch_dtype, groupsize=group_size + ).quantize(model) + if verbose: + print("quantized model:", model) + return model + elif qmode == "8da4w-gptq": + # Check for required args + required_args: Optional[Any] = [ + group_size, + calibration_limit, + calibration_seq_length, + ] + if any(arg is None for arg in required_args): + raise Exception( + "For 8da4w-gptq quantization, group size, calibration limit and calibration sequence length must be specified." + ) + if calibration_tasks is None: + calibration_tasks = ["wikitext"] + + from torchao.quantization.GPTQ import InputRecorder + from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer + + if tokenizer_path is None: + tokenizer_path = checkpoint_path.parent / "tokenizer.model" + assert tokenizer_path.is_file(), tokenizer_path + tokenizer = SentencePieceProcessor( # pyre-ignore[28] + model_file=str(tokenizer_path) + ) + + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + None, # input_prep_func + pad_calibration_inputs, + model.vocab_size, + ) + .record_inputs( + calibration_tasks, + calibration_limit, + ) + .get_inputs() + ) + + gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer( + blocksize, + percdamp, + group_size, + ) + model = gptq_quantizer.quantize(model, inputs) + return model + else: + raise Exception(f"Unrecognized quantize mode: {qmode}") + + def dynamically_quantize_per_channel( x, quant_min, @@ -122,6 +225,10 @@ def dynamically_quantize_per_channel( return quant, scales, zero_points +######################################################################### +### QuantHandler API definition ### + + class QuantHandler: def __init__(self, mod): self.mod = mod @@ -132,8 +239,15 @@ def create_quantized_state_dict(self) -> Dict: # "StateDict" def convert_for_runtime(self) -> nn.Module: pass + def quantized_model(self) -> nn.Module: + model_updated_state_dict = self.create_quantized_state_dict() + self.convert_for_runtime() + self.mod.load_state_dict(model_updated_state_dict) + return self.mod + -##### Weight-only int8 per-channel quantized code ###### +######################################################################### +### Weight-only int8 per-channel quantized code ### def replace_linear_weight_only_int8_per_channel(module, node_type): @@ -151,16 +265,17 @@ def replace_linear_weight_only_int8_per_channel(module, node_type): setattr( module, name, - WeightOnlyInt8Linear(child.in_features, child.out_features), + WeightOnlyInt8Linear("cpu", child.in_features, child.out_features), ) else: replace_linear_weight_only_int8_per_channel(child, node_type) -class WeightOnlyInt8QuantHandler: +class WeightOnlyInt8QuantHandler(QuantHandler): def __init__( self, mod, + device="cpu", *, node_type: str = "*", bitwidth: Optional[int] = None, @@ -200,7 +315,7 @@ def create_quantized_state_dict(self) -> Dict: ) ): print( - f"quantize {self.node_type} {fqn, mod} with groupsize {self.group_size}, bitwidth {self.bitwidth}" + f"quantize {self.node_type} {fqn, mod} with group_size {self.group_size}, bitwidth {self.bitwidth}" ) # print(f"initial weight shape {mod.weight.shape}") @@ -217,7 +332,7 @@ def create_quantized_state_dict(self) -> Dict: ) 
cur_state_dict[f"{fqn}.weight"] = weight - # squeeze makes groupsize=rowsize unidimensional + # squeeze makes group_size=rowsize unidimensional cur_state_dict[f"{fqn}.scales"] = scales.squeeze(dim=-1) return cur_state_dict @@ -241,10 +356,10 @@ class WeightOnlyInt8Linear(torch.nn.Module): def __init__( self, + device, in_features: int, out_features: int, bias: bool = True, - device=None, dtype=None, ) -> None: super().__init__() @@ -260,11 +375,12 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # return F.linear(input, self.weight.to(dtype=input.dtype)) * se... -##### embedding table quantization ###### +######################################################################### +##### embedding table quantization ###### def replace_embedding_weight_only_grouped_int8_per_channel( - module, bitwidth: int = 8, group_size: Optional[int] = None + module, device, bitwidth: int = 8, group_size: Optional[int] = None, packed=False ): for name, child in module.named_children(): # print(f"name: {name}") @@ -275,25 +391,41 @@ def replace_embedding_weight_only_grouped_int8_per_channel( module, name, QuantizedGroupEmbedding( + device=device, vocab_size=child.weight.shape[0], embedding_dim=child.weight.shape[1], group_size=group_size, + packed=packed, ), ) else: replace_embedding_weight_only_grouped_int8_per_channel( - child, bitwidth, group_size + child, device, bitwidth, group_size, packed ) -class EmbeddingOnlyInt8QuantHandler: - def __init__(self, mod, *, bitwidth: int = 8, group_size: Optional[int] = None): +class EmbeddingQuantHandler(QuantHandler): + def __init__( + self, + mod, + device="cpu", + *, + bitwidth: int = 8, + group_size: Optional[int] = None, + packed=False, + ): + if isinstance(packed, str): + packed = packed == "True" self.mod = mod + self.device = device self.group_size = group_size self.bitwidth = bitwidth + self.packed = packed + if (bitwidth != 4) and packed: + raise RuntimeError("pack only works with bitsize 4") @torch.no_grad() - def create_quantized_state_dict(self) -> Dict: + def create_quantized_state_dict(self, packed=False) -> Dict: cur_state_dict = self.mod.state_dict() if self.bitwidth == 4: @@ -306,18 +438,14 @@ def create_quantized_state_dict(self) -> Dict: raise ValueError(f"Unsupported bitwidth {self.bitwidth}") for fqn, mod in self.mod.named_modules(): - if ( - isinstance(mod, nn.Embedding) - or isinstance(mod, fsEmbedding) - or isinstance(mod, fsStandardEmbedding) - ): + if isinstance(mod, nn.Embedding): # print("****") # print(f"Embedding identified: {fqn, mod}") # print(f"weights size: {mod.weight.size()}") # print(f"quantize {fqn}...") print( - f"quantize {fqn, mod} with groupsize {self.group_size}, bitwidth {self.bitwidth}" + f"quantize {fqn, mod} with group_size {self.group_size}, bitwidth {self.bitwidth}" ) weight, scales, _ = dynamically_quantize_per_channel( mod.weight.float(), @@ -328,21 +456,36 @@ def create_quantized_state_dict(self) -> Dict: scales_dtype=mod.weight.dtype, ) + if packed: + if weight.shape[-1] % 2 != 0: + raise RuntimeError("automatic padding not implemented yet") + + weight_range_shifted = weight.add(8).view(torch.uint8) + weight_view = weight_range_shifted.view( + weight.shape[0], weight.shape[1] // 2, 2 + ) + weight_even = weight_view[:, :, 0] * 16 # left shift 4 + weight_odd = weight_view[:, :, 1] + weight_packed = weight_even + weight_odd + weight = weight_packed + + weight = weight.to(device=self.device) + scales = scales.to(device=self.device) # Update state dict cur_state_dict[f"{fqn}.weight"] = weight - # squeeze makes 
groupsize=rowsize unidimensional + # squeeze makes group_size=rowsize unidimensional cur_state_dict[f"{fqn}.scales"] = scales.squeeze(dim=-1) return cur_state_dict def convert_for_runtime(self) -> nn.Module: replace_embedding_weight_only_grouped_int8_per_channel( - self.mod, self.bitwidth, self.group_size + self.mod, self.device, self.bitwidth, self.group_size, self.packed ) return self.mod def quantized_model(self) -> nn.Module: - model_updated_state_dict = self.create_quantized_state_dict() + model_updated_state_dict = self.create_quantized_state_dict(self.packed) self.convert_for_runtime() self.mod.load_state_dict(model_updated_state_dict) return self.mod @@ -351,39 +494,101 @@ def quantized_model(self) -> nn.Module: class QuantizedGroupEmbedding(torch.nn.Module): def __init__( self, + device, vocab_size: int, embedding_dim: int, group_size: Optional[int] = None, - device=None, dtype=torch.half, + packed=False, ) -> None: super().__init__() - if group_size is None: + if group_size is None or group_size == 0: group_size = embedding_dim self.group_size = group_size self.dtype = dtype - self.register_buffer( - "weight", torch.empty((vocab_size, embedding_dim), dtype=torch.int8) - ) + self.packed = packed + if not packed: + self.register_buffer( + "weight", + torch.empty( + (vocab_size, embedding_dim), dtype=torch.int8, device=device + ), + ) + else: # packed + self.register_buffer( + "weight", + torch.empty( + (vocab_size, embedding_dim // 2), dtype=torch.uint8, device=device + ), + ) groups_per_row = (embedding_dim + group_size - 1) // group_size if groups_per_row > 1: self.register_buffer( - "scales", torch.ones((vocab_size, groups_per_row), dtype=torch.float16) + "scales", + torch.ones( + (vocab_size, groups_per_row), dtype=torch.float16, device=device + ), ) else: self.register_buffer( - "scales", torch.ones((vocab_size,), dtype=torch.float16) + "scales", torch.ones((vocab_size,), dtype=torch.float16, device=device) ) @torch.no_grad() def forward(self, indices: torch.Tensor) -> torch.Tensor: - return torch.ops.llama_quantized.embedding_byte.dtype( - self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype - ) + if not self.packed: # 8bit + return torch.ops.quantized_decomposed.embedding_byte.dtype( + self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype + ) + else: # 4bit packed + return torch.ops.quantized_decomposed.embedding_4bit.dtype( + self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype + ) -# result_weights = self.weight.index_select(0, indices.view(-1)) -# result_scales = self.scales.index_select(0, indices.view(-1)) -# -# r = result_weights.to(dtype=result_scales.dtype) * result_scales -# return r.view(indices.size() + (-1,)) +############################ Source Transform Start ####################### + + +def get_quant_embedding_transform(args): + bitwidth, group_size = args.embedding_quantize.split(",") + if group_size == "none" or group_size == "None" or group_size == "0": + group_size = None + else: + group_size = int(group_size) + bitwidth = int(bitwidth) + return lambda model: EmbeddingQuantHandler( + model, + bitwidth=bitwidth, + group_size=group_size, + packed=(bitwidth == 4), + ).quantized_model() + + +def get_quant_weight_transform(args, dtype_override, verbose): + # If these optional args are None, don't provide them to quantize() + quant_args_str = [ + "group_size", + "calibration_tasks", + "calibration_limit", + "calibration_seq_length", + ] + arg_dict = vars(args) + quant_args = { + param: val + for param in quant_args_str + if (val 
:= arg_dict.get(param)) is not None + } + + return partial( + quantize, + **quant_args, + qmode=args.quantization_mode, + activation_dtype=dtype_override, + checkpoint_path=(Path(path) if (path := args.checkpoint) is not None else None), + tokenizer_path=( + Path(path) if (path := args.tokenizer_path) is not None else None + ), + ) + + +############################ Source Transform End ####################### diff --git a/examples/models/llama2/source_transformation/rope.py b/examples/models/llama2/source_transformation/rope.py new file mode 100644 index 00000000000..7061636f0c6 --- /dev/null +++ b/examples/models/llama2/source_transformation/rope.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from ..llama_transformer import Transformer + + +def materialze_broadcast_of_rope_freq_cis( + module: torch.nn.Module, +): + assert isinstance(module, Transformer) + assert module.freqs_cos.dim() == 2 + dim0 = module.freqs_cos.size(0) + dim1 = module.freqs_cos.size(1) + assert ( + module.layers[0].attention.n_local_kv_heads + == module.layers[0].attention.n_local_heads + ), f"For rope freqs to be materialzed for broadcast q, k, v num heads must match. For q got {module.attention.n_kv_heads} for k got {module.attention.n_local_heads} and v got {module.attention.n_local_kv_heads}" + num_heads = module.layers[0].attention.n_local_heads + module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) + module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() + assert module.freqs_sin.dim() == 2 + assert dim0 == module.freqs_sin.size( + 0 + ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.freqs_sin.size(0)}" + assert dim1 == module.freqs_sin.size( + 1 + ), f"sin and cos freq table sizes must match. Mismatch found at dim 1: {dim1} vs {module.freqs_sin.size(1)}" + module.freqs_sin = module.freqs_sin.view(dim0, 1, dim1) + module.freqs_sin = module.freqs_sin.expand(dim0, num_heads, dim1).contiguous() + return module diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama2/source_transformation/sdpa.py new file mode 100644 index 00000000000..8a8a0cac7c2 --- /dev/null +++ b/examples/models/llama2/source_transformation/sdpa.py @@ -0,0 +1,138 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
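For context, a minimal shape-level sketch (hypothetical sizes standing in for max_seq_len, head_dim // 2, and the head count) of the expansion performed by materialze_broadcast_of_rope_freq_cis in rope.py above:

import torch

seq_len, half_head_dim, n_heads = 128, 32, 8  # hypothetical sizes
freqs_cos = torch.randn(seq_len, half_head_dim)

# Materialize the per-head broadcast ahead of time instead of relying on
# implicit broadcasting at runtime.
freqs_cos = freqs_cos.view(seq_len, 1, half_head_dim)
freqs_cos = freqs_cos.expand(seq_len, n_heads, half_head_dim).contiguous()
assert freqs_cos.shape == (seq_len, n_heads, half_head_dim)

The same view/expand/contiguous sequence is applied to freqs_sin after checking that both tables have matching dimensions.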
+ +# Example script for exporting Llama2 to flatbuffer + +import math + +import torch + +from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA + + +class SDPACustom(torch.nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + output = torch.ops.llama.sdpa_with_kv_cache( + q, + k, + v, + self.kv_cache.k_cache, + self.kv_cache.v_cache, + input_pos[-1].item(), + seqlen, + ) + return output.view(bsz, seqlen, self.dim) + + +def _replace_sdpa_with_custom_op(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPACustom(child.kv_cache, child.dim), + ) + else: + _replace_sdpa_with_custom_op(child) + + +def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache # noqa + + _replace_sdpa_with_custom_op(module) + return module + + +class SDPASimple(torch.nn.Module): + + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + scale_factor = 1 / math.sqrt(q.size(-1)) + attn_weight = q @ k.transpose(-2, -1) * scale_factor + attn_weight += attn_mask + attn_weight = torch.softmax(attn_weight, dim=-1) + y = attn_weight @ v + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + +def replace_sdpa_with_simple_sdpa(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPASimple(child.kv_cache, child.dim, child.head_dim, child.n_rep), + ) + else: + replace_sdpa_with_simple_sdpa(child) + return module + + +def replace_causal_mask(module: torch.nn.Module): + for buffer_fqn_name, buffer in module.named_buffers(): + buffer_name = buffer_fqn_name.split(".")[-1] + if buffer_name == "mask": + max_seq_len = buffer.shape[-1] + mask = torch.full( + (max_seq_len, max_seq_len), + float("-inf"), + device="cpu", + ) + + mask = torch.triu(mask, diagonal=1) + module.register_buffer(buffer_name, mask) + for _, child in module.named_children(): + replace_causal_mask(child) + return module diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama2/tests/TARGETS new file mode 100644 index 00000000000..3d2aef6209f --- /dev/null +++ b/examples/models/llama2/tests/TARGETS @@ -0,0 +1,15 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "test_simple_sdpa", + srcs = [ + "test_simple_sdpa.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama2:export_library", + "//executorch/examples/models/llama2:llama_transformer", + ], +) diff --git a/examples/models/llama2/tests/test_simple_sdpa.py 
b/examples/models/llama2/tests/test_simple_sdpa.py new file mode 100644 index 00000000000..9113059fd5d --- /dev/null +++ b/examples/models/llama2/tests/test_simple_sdpa.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import unittest + +import torch +from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama2.source_transformation.sdpa import SDPASimple + + +class SDPATest(unittest.TestCase): + def test_simple_sdpa(self): + # Verify the correctness between the simple SDPA and the original SDPA module defined in llama_transformer.py + max_batch_size = 1 + max_seq_length = 128 + n_heads = 8 + head_dim = 8 + dim = 64 + n_rep = 1 + bsz = 1 + seqlen = 1 + n_local_heads = n_heads + kv_cache = KVCache( + max_batch_size=max_batch_size, + max_seq_length=max_seq_length, + n_heads=n_heads, + head_dim=head_dim, + transpose_cache=True, + ) + sdpa = SDPA( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + input_pos = torch.tensor([0]) + query = torch.randn(1, 1, n_local_heads, head_dim) + key = torch.randn(1, 1, n_local_heads, head_dim) + value = torch.randn(1, 1, n_local_heads, head_dim) + mask = torch.randn(max_seq_length, max_seq_length) + sdpa_output = sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + simple_sdpa = SDPASimple( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + simple_sdpa_output = simple_sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + # Compare the output from output from two sdpa implementation + self.assertTrue(torch.allclose(sdpa_output, simple_sdpa_output)) diff --git a/examples/models/llama2/third-party/abseil-cpp b/examples/models/llama2/third-party/abseil-cpp new file mode 160000 index 00000000000..85419307149 --- /dev/null +++ b/examples/models/llama2/third-party/abseil-cpp @@ -0,0 +1 @@ +Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/examples/models/llama2/third-party/re2 b/examples/models/llama2/third-party/re2 new file mode 160000 index 00000000000..ac82d4f628a --- /dev/null +++ b/examples/models/llama2/third-party/re2 @@ -0,0 +1 @@ +Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/examples/models/llama2/tokenizer/base64.h b/examples/models/llama2/tokenizer/base64.h new file mode 100644 index 00000000000..9fb1b5129b3 --- /dev/null +++ b/examples/models/llama2/tokenizer/base64.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +// @lint-ignore-every LICENSELINT +/************************************************************************** + Copyright (c) 2023 sewenew + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + *************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace base64 { + +std::string decode(const std::string_view& input); + +namespace detail { + +constexpr uint32_t DECODE_TABLE[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, + 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +inline void validate(uint32_t v) { + ET_CHECK_MSG(v != 255, "invalid char"); +} + +inline void decode(const std::string_view& input, std::string& output) { + ET_CHECK_MSG( + input.size() == 4, "input length must be 4, got %zu", input.size()); + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + c = input[2]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + c = input[3]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 16) & 0xFF)); + output.push_back(static_cast((val >> 8) & 0xFF)); + output.push_back(static_cast(val & 0xFF)); +} + +inline void decode_1_padding( + const std::string_view& input, + std::string& output) { + ET_CHECK_MSG( + input.size() == 3, "input length must be 3, got %zu", input.size()); + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + c = input[2]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 10) & 0xFF)); + output.push_back(static_cast((val >> 2) & 0xFF)); +} + +inline void decode_2_padding( + const std::string_view& input, + std::string& output) { + assert(input.size() == 2); + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 4) & 0xFF)); +} + +} // namespace detail + +inline std::string decode(const std::string_view& input) { + ET_CHECK_MSG(!input.empty(), "empty input"); + + // Faster than `input.size() % 4`. 
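+ // A 4-character base64 group encodes 24 bits (3 output bytes), so the check
+ // below requires the input length to be a nonzero multiple of 4.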
+ ET_CHECK_MSG( + (input.size() & 3) == 0 && input.size() >= 4, + "input length must be larger than 4 and is multiple of 4, got %zu", + input.size()); + + std::string output; + output.reserve(input.size() / 4 * 3); + auto idx = 0U; + for (; idx < input.size() - 4; idx += 4) { + detail::decode(input.substr(idx, 4), output); + } + + // Last 4 bytes. Might contain paddings. + if (input[idx + 3] == '=') { + if (input[idx + 2] == '=') { + // Tow paddings. + detail::decode_2_padding(input.substr(idx, 2), output); + } else { + // One padding. + detail::decode_1_padding(input.substr(idx, 3), output); + } + } else { + // No padding. + detail::decode(input.substr(idx, 4), output); + } + + return output; +} + +} // namespace base64 + +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/tokenizer/tokenizer.cpp b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp similarity index 89% rename from examples/models/llama2/tokenizer/tokenizer.cpp rename to examples/models/llama2/tokenizer/bpe_tokenizer.cpp index b380cc675b4..ed7d34aca4d 100644 --- a/examples/models/llama2/tokenizer/tokenizer.cpp +++ b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include @@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) { return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str); } -Tokenizer::Tokenizer(int32_t vocab_size, int32_t bos_tok, int32_t eos_tok) - : initialized_(false), - vocab_size_(vocab_size), - bos_tok_(bos_tok), - eos_tok_(eos_tok), +BPETokenizer::BPETokenizer( + int32_t vocab_size, + uint64_t bos_tok, + uint64_t eos_tok) + : Tokenizer(vocab_size, bos_tok, eos_tok), vocab_(std::make_unique(vocab_size)), vocab_scores_(std::make_unique(vocab_size)), sorted_vocab_(std::make_unique(vocab_size)) { @@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, int32_t bos_tok, int32_t eos_tok) * @param tokenizer_path The path to the tokenizer file. * @return Error */ -Error Tokenizer::load(const std::string& tokenizer_path) { +Error BPETokenizer::load(const std::string& tokenizer_path) { if (initialized_) { ET_LOG(Info, "Tokenizer already initialized"); return Error::Ok; @@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) { return Error::Ok; } -Tokenizer::~Tokenizer() { +BPETokenizer::~BPETokenizer() { for (int i = 0; i < vocab_size_; i++) { delete[] vocab_[i]; } @@ -142,10 +142,10 @@ Tokenizer::~Tokenizer() { * * @param prev_token The previous token. * @param token The current token. - * @return Result A pointer to the string representation of the + * @return Result A pointer to the string representation of the * token. */ -Result Tokenizer::decode(int32_t prev_token, int32_t token) { +Result BPETokenizer::decode(uint64_t prev_token, uint64_t token) { if (!initialized_) { ET_LOG(Error, "Tokenizer not initialized"); return Error::NotSupported; @@ -162,7 +162,8 @@ Result Tokenizer::decode(int32_t prev_token, int32_t token) { if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) { piece = (char*)byte_pieces_ + byte_val * 2; } - return piece; + std::string res(piece); + return res; } static int32_t @@ -183,14 +184,10 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) { * @param eos The number of EOS to append to the token list. * @param tokens The output tokens. * @param n_tokens The number of tokens. 
- * @return Error + * @return Result> */ -Error Tokenizer::encode( - const char* text, - int8_t bos, - int8_t eos, - int32_t* tokens, - int32_t* n_tokens) { +Result> +BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { if (!initialized_) { ET_LOG(Error, "Tokenizer not initialized"); return Error::NotSupported; @@ -198,8 +195,8 @@ Error Tokenizer::encode( // encode the string text (input) into an upper-bound preallocated tokens[] // array bos != 0 means prepend the BOS token (=1), eos != 0 means append the // EOS token (=2) - if (text == nullptr) { - ET_LOG(Error, "cannot encode null text"); + if (text.empty()) { + ET_LOG(Error, "cannot encode empty text"); return Error::InvalidArgument; } @@ -210,12 +207,12 @@ Error Tokenizer::encode( size_t str_len = 0; // start at 0 tokens - *n_tokens = 0; + std::vector tokens; // add optional BOS token, if desired if (bos > 0) { while (bos--) { - tokens[(*n_tokens)++] = bos_tok_; + tokens.push_back(bos_tok_); } } else { ET_LOG(Error, "bos %d should be >= 0", bos); @@ -230,7 +227,7 @@ Error Tokenizer::encode( const char* space = " "; if (text[0] != '\0') { int dummy_prefix = str_lookup(space, sorted_vocab_.get(), vocab_size_); - tokens[(*n_tokens)++] = dummy_prefix; + tokens.push_back(dummy_prefix); } // Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia: @@ -242,7 +239,7 @@ Error Tokenizer::encode( // U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // process the raw (UTF-8) byte sequence of the input string - for (const char* c = text; *c != '\0'; c++) { + for (const char* c = text.c_str(); *c != '\0'; c++) { // reset buffer if the current byte is ASCII or a leading byte // 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the // rest 0x80 is 10000000 in UTF-8, all continuation bytes start with "10" in @@ -271,13 +268,13 @@ Error Tokenizer::encode( int id = str_lookup(str_buffer, sorted_vocab_.get(), vocab_size_); if (id != -1) { // we found this codepoint in vocab, add it as a token - tokens[(*n_tokens)++] = id; + tokens.push_back(id); } else { // byte_fallback encoding: just encode each byte as a token // +3 is here because the first 3 vocab elements are , , // so the individual bytes only start at index 3 for (int i = 0; i < str_len; i++) { - tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3; + tokens.push_back((unsigned char)str_buffer[i] + 3); } } str_len = 0; // protect against a sequence of stray UTF8 continuation bytes @@ -290,7 +287,7 @@ Error Tokenizer::encode( int best_id = -1; int best_idx = -1; - for (int i = 0; i < (*n_tokens - 1); i++) { + for (int i = 0; i < tokens.size() - 1; i++) { // check if we can merge the pair (tokens[i], tokens[i+1]) snprintf( str_buffer, @@ -314,16 +311,16 @@ Error Tokenizer::encode( // merge the consecutive pair (best_idx, best_idx+1) into new token best_id tokens[best_idx] = best_id; // delete token at position best_idx+1, shift the entire sequence back 1 - for (int i = best_idx + 1; i < (*n_tokens - 1); i++) { + for (int i = best_idx + 1; i < tokens.size() - 1; i++) { tokens[i] = tokens[i + 1]; } - (*n_tokens)--; // token length decreased + tokens.pop_back(); // token length decreased } // add optional EOS (=2) token, if desired if (eos >= 0) { while (eos--) { - tokens[(*n_tokens)++] = eos_tok_; + tokens.push_back(eos_tok_); } } else { ET_LOG(Error, "eos %d should be >= 0", eos); @@ -331,7 +328,7 @@ Error Tokenizer::encode( } delete[] str_buffer; - return Error::Ok; + return Result(tokens); } } // namespace executor diff --git 
a/examples/models/llama2/tokenizer/bpe_tokenizer.h b/examples/models/llama2/tokenizer/bpe_tokenizer.h new file mode 100644 index 00000000000..82e3f396344 --- /dev/null +++ b/examples/models/llama2/tokenizer/bpe_tokenizer.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace torch { +namespace executor { + +struct TokenIndex { + const char* str; + int32_t id; +}; + +class BPETokenizer : public Tokenizer { + public: + explicit BPETokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok); + ~BPETokenizer() override; + + Error load(const std::string& tokenizer_path) override; + + Result> + encode(const std::string& input, int8_t bos, int8_t eos) override; + + Result decode(uint64_t prev_token, uint64_t token) override; + + private: + std::unique_ptr vocab_; + std::unique_ptr vocab_scores_; + std::unique_ptr sorted_vocab_; + unsigned int max_token_length_; + unsigned char byte_pieces_[512]; // stores all single-byte strings +}; +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/tokenizer/targets.bzl b/examples/models/llama2/tokenizer/targets.bzl index b63f780faa1..51fdad8fa06 100644 --- a/examples/models/llama2/tokenizer/targets.bzl +++ b/examples/models/llama2/tokenizer/targets.bzl @@ -2,12 +2,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): runtime.cxx_library( - name = "tokenizer", + name = "bpe_tokenizer", srcs = [ - "tokenizer.cpp", + "bpe_tokenizer.cpp", ], exported_headers = [ "tokenizer.h", + "bpe_tokenizer.h", ], exported_deps = [ "//executorch/runtime/core/exec_aten:lib", @@ -18,6 +19,28 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "tiktoken", + srcs = [ + "tiktoken.cpp", + ], + exported_headers = [ + "tokenizer.h", + "tiktoken.h", + "base64.h", + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_external_deps = [ + "re2", + ], + ) + runtime.python_library( name = "tokenizer_py_lib", srcs = [ @@ -28,6 +51,7 @@ def define_common_targets(): visibility = [ "//executorch/examples/...", "//bento/...", + "//bento_kernels/...", ], _is_external_target = True, deps = [] if runtime.is_oss else ["fbsource//third-party/pypi/sentencepiece:sentencepiece"], diff --git a/examples/models/llama2/tokenizer/test/targets.bzl b/examples/models/llama2/tokenizer/test/targets.bzl index 7ed15b81b9e..b8225cd06df 100644 --- a/examples/models/llama2/tokenizer/test/targets.bzl +++ b/examples/models/llama2/tokenizer/test/targets.bzl @@ -8,18 +8,34 @@ def define_common_targets(): """ runtime.cxx_test( - name = "test", + name = "test_bpe_tokenizer", srcs = [ - "test_tokenizer.cpp", + "test_bpe_tokenizer.cpp", ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tokenizer", + "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer", ], env = { "RESOURCES_PATH": "$(location :resources)/resources", }, ) + runtime.cxx_test( + name = "test_tiktoken", + srcs = [ + "test_tiktoken.cpp", + ], + deps = [ + "//executorch/examples/models/llama2/tokenizer:tiktoken", + ], + env = { + "RESOURCES_PATH": "$(location :resources_fb_only)/resources", + }, + external_deps = [ + "re2", + ], + ) + runtime.filegroup( name = 
"resources", srcs = native.glob([ @@ -27,6 +43,13 @@ def define_common_targets(): ]), ) + runtime.filegroup( + name = "resources_fb_only", + srcs = native.glob([ + "resources/fb/**", + ]), + ) + runtime.python_test( name = "test_tokenizer_py", srcs = [ diff --git a/examples/models/llama2/tokenizer/test/test_tokenizer.cpp b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp similarity index 82% rename from examples/models/llama2/tokenizer/test/test_tokenizer.cpp rename to examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp index 95fb2be7829..1d1f83065cf 100644 --- a/examples/models/llama2/tokenizer/test/test_tokenizer.cpp +++ b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp @@ -6,9 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include +#include using namespace ::testing; @@ -19,7 +21,7 @@ class TokenizerExtensionTest : public Test { public: void SetUp() override { torch::executor::runtime_init(); - tokenizer_ = std::make_unique(32000, 1, 2); + tokenizer_ = std::make_unique(32000, 1, 2); modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin"); } @@ -28,8 +30,8 @@ class TokenizerExtensionTest : public Test { }; TEST_F(TokenizerExtensionTest, EncodeWithoutLoadFails) { - Error error = tokenizer_->encode("hello world", 0, 0, nullptr, nullptr); - EXPECT_EQ(error, Error::NotSupported); + Result> res = tokenizer_->encode("hello world", 0, 0); + EXPECT_EQ(res.error(), Error::NotSupported); } TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) { diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp new file mode 100644 index 00000000000..2f08e2a1aa7 --- /dev/null +++ b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +using namespace ::testing; + +namespace torch { +namespace executor { + +class TiktokenExtensionTest : public Test { + public: + void SetUp() override { + torch::executor::runtime_init(); + tokenizer_ = std::make_unique(128256, 128000, 128001); + modelPath_ = + std::getenv("RESOURCES_PATH") + std::string("/fb/tokenizer.model"); + } + + std::unique_ptr tokenizer_; + std::string modelPath_; +}; + +TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) { + Result> res = tokenizer_->encode("hello world", 0, 0); + EXPECT_EQ(res.error(), Error::NotSupported); +} + +TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) { + auto result = tokenizer_->decode(0, 0); + EXPECT_EQ(result.error(), Error::NotSupported); +} + +TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) { + Error res = tokenizer_->load(modelPath_.c_str()); + EXPECT_EQ(res, Error::Ok); + // test.bin has vocab size 0 but the tokenizer respects the vocab size being + // passed in and add placeholder tokens. + EXPECT_EQ(tokenizer_->vocab_size(), 128256); + EXPECT_EQ(tokenizer_->bos_tok(), 128000); + EXPECT_EQ(tokenizer_->eos_tok(), 128001); +} + +TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) { + Error res = tokenizer_->load(modelPath_.c_str()); + EXPECT_EQ(res, Error::Ok); + // test.bin has vocab size 0 but the tokenizer respects the vocab size being + // passed in and add placeholder tokens. 
+ Result> out = tokenizer_->encode("hello world", 1, 0); + EXPECT_EQ(out.error(), Error::Ok); + EXPECT_EQ(out.get().size(), 3); + EXPECT_EQ(out.get()[0], 128000); + EXPECT_EQ(out.get()[1], 15339); + EXPECT_EQ(out.get()[2], 1917); +} + +TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) { + Error res = tokenizer_->load(modelPath_.c_str()); + EXPECT_EQ(res, Error::Ok); + // test.bin has vocab size 0 but the tokenizer respects the vocab size being + // passed in and add placeholder tokens. + std::vector expected = {"<|begin_of_text|>", "hello", " world"}; + std::vector tokens = {128000, 15339, 1917}; + for (size_t i = 0; i < tokens.size(); i++) { + Result out = tokenizer_->decode(0, tokens[i]); + EXPECT_EQ(out.error(), Error::Ok); + EXPECT_EQ(out.get(), expected[i]); + } +} + +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/tokenizer/tiktoken.cpp b/examples/models/llama2/tokenizer/tiktoken.cpp new file mode 100644 index 00000000000..849a2ff1e8d --- /dev/null +++ b/examples/models/llama2/tokenizer/tiktoken.cpp @@ -0,0 +1,391 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Adopted from https://github.com/sewenew/tokenizer + +// @lint-ignore-every LICENSELINT +/************************************************************************** + Copyright (c) 2023 sewenew + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ *************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { + +// ------------------------------Util start------------------------------------ + +static uint64_t _max_size() { + return std::numeric_limits::max(); +} + +static Re2UPtr _create_regex(const std::string& pattern) { + assert(!pattern.empty()); + + return std::make_unique("(" + pattern + ")"); +} + +static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { + std::string special_pattern; + for (const auto& ele : special_encoder) { + if (!special_pattern.empty()) { + special_pattern += "|"; + } + special_pattern += re2::RE2::QuoteMeta(ele.first); + } + + if (special_pattern.empty()) { + return nullptr; + } + + return _create_regex(special_pattern); +} + +static std::pair _parse(const std::string& line) { + auto pos = line.find(" "); + ET_CHECK_MSG( + pos != std::string::npos, "invalid encoder line: %s", line.c_str()); + + auto token = base64::decode({line.data(), pos}); + uint64_t rank = 0; + try { + rank = std::stoul(line.substr(pos + 1)); + } catch (const std::exception&) { + ET_CHECK_MSG(false, "invalid encoder rank: %s", line.c_str()); + } + + return {std::move(token), rank}; +} + +static Encoder _load_encoder(const std::string& path) { + std::ifstream file(path); + ET_CHECK_MSG(file, "failed to open encoder file: %s", path.c_str()); + + Encoder encoder; + std::string line; + while (std::getline(file, line)) { + auto [token, rank] = _parse(line); + + ET_CHECK_MSG( + encoder.emplace(std::move(token), rank).second, + "duplicate item: %s", + line.c_str()); + } + + return encoder; +} + +static Decoder _build_decoder(const Encoder& encoder) { + Decoder decoder; + for (const auto& [k, v] : encoder) { + decoder.emplace(v, k); + } + + ET_CHECK_MSG(encoder.size() == decoder.size(), "duplicate items in encoder"); + + return decoder; +} + +static std::vector _byte_pair_merge( + const std::string& piece, + const std::unordered_map& ranks, + std::function func) { + // This is a vector of (start, rank). + // The rank is of the byte pair starting at position start. + // The rank of the last item in the vector is not a valid value. + std::vector> parts; + parts.reserve(piece.size() + 1); + for (auto idx = 0U; idx < piece.size() + 1; ++idx) { + parts.emplace_back(idx, _max_size()); + } + + auto get_rank = [&piece, &ranks]( + const std::vector>& parts, + uint64_t start_idx, + uint64_t skip) -> std::optional { + if (start_idx + skip + 2 < parts.size()) { + auto s = parts[start_idx].first; + auto e = parts[start_idx + skip + 2].first; + auto key = piece.substr(s, e - s); + auto iter = ranks.find(key); + if (iter != ranks.end()) { + return iter->second; + } + } + return std::nullopt; + }; + + // We look up the ranks once in the beginning and iteratively update + // them during each merge, which reduces the number of rank lookups. + for (auto i = 0U; i < parts.size() - 2; ++i) { + auto rank = get_rank(parts, i, 0); + if (rank) { + // usize::MAX is a sentinel value and cannot be a valid rank + ET_CHECK_MSG(*rank != _max_size(), "rank is too large"); + parts[i].second = *rank; + } + } + + // If you have n parts and m merges, this does O(mn) work. + // We could do something with a heap and do O(m log n) work. 
+ // It is important to consider that n is often small (<100), and as such + // the cache-locality benefits outweigh the algorithmic complexity downsides + // of the `parts` vector data structure above. + + // Note that we hash bytes, not token pairs. As long as we train BPE the way + // we currently do, this is equivalent. An easy way to break this would be + // to decouple merge priority from token index or to prevent specific token + // merges. + while (true) { + if (parts.size() == 1) { + break; + } + + // usize::MAX is a sentinel rank value allowing us to + // take the min more quickly + auto min_rank = std::make_pair(_max_size(), 0); + for (auto i = 0U; i < parts.size() - 1; ++i) { + auto rank = parts[i].second; + if (rank < min_rank.first) { + min_rank.first = rank; + min_rank.second = i; + } + } + + if (min_rank.first != _max_size()) { + auto i = min_rank.second; + + // NOTE: We are about to remove parts[i + 1]. We do not do it + // yet because there are cache-locality benefits to updating + // parts[i] and parts[i-1] before removing, which could thrash + // the cache. Thus, we update the rank calculation by skipping over + // parts[i + 1], by invoking `get_rank!` with `skip = 1`. + auto rank = get_rank(parts, i, 1); + if (rank) { + parts[i].second = *rank; + } else { + parts[i].second = _max_size(); + } + if (i > 0) { + rank = get_rank(parts, i - 1, 1); + if (rank) { + parts[i - 1].second = *rank; + } else { + parts[i - 1].second = _max_size(); + } + } + + parts.erase(parts.begin() + (i + 1)); + } else { + break; + } + } + std::vector out; + out.reserve(parts.size() - 1); + for (auto i = 0U; i < parts.size() - 1; ++i) { + auto s = parts[i].first; + auto e = parts[i + 1].first; + out.push_back(func(s, e)); + } + return out; +} + +static std::vector _byte_pair_encode( + const std::string& piece, + const Encoder& encoder) { + if (piece.size() == 1) { + auto iter = encoder.find(piece); + if (iter != encoder.end()) { + return std::vector({iter->second}); + } else { + // TODO: is it possible? + return {}; + } + } + + return _byte_pair_merge( + piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { + std::string key = piece.substr(start, stop - start); + auto iter = encoder.find(key); + if (iter != encoder.end()) { + return iter->second; + } else { + // TODO: what if key does not exist? Should we return `unknown`? + // assert(false); // ?? + return uint64_t(0); + } + }); +} +// ------------------------------Util end------------------------------------ +// -------------------------private method start------------------------------- + +template +std::pair, re2::StringPiece> +Tiktoken::_split_with_allowed_special_token( + re2::StringPiece& input, + const T& allowed_special) { + if (!_special_token_regex) { + return std::make_pair(std::nullopt, input); + } + + auto start = input.begin(); + std::string special; + while (true) { + if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { + // No special token. + break; + } + + if (allowed_special.count(special) == 1) { + // Found an allowed special token, split the text with it. 
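+ // Return the allowed special token together with the text that preceded
+ // it; the caller BPE-encodes that prefix before appending the special
+ // token id.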
+ return std::make_pair( + special, + re2::StringPiece(start, input.begin() - start - special.size())); + } // else try to find the next special token + } + + return std::make_pair(std::nullopt, input); +} + +void Tiktoken::_encode( + re2::StringPiece& input, + std::vector& ret, + uint64_t& last_piece_token_len) { + std::string piece; + assert(_regex); + while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { + auto iter = _encoder.find(piece); + if (iter != _encoder.end()) { + last_piece_token_len = 1; + ret.push_back(iter->second); + continue; + } + auto tokens = _byte_pair_encode(piece, _encoder); + last_piece_token_len = tokens.size(); + ret.insert(ret.end(), tokens.begin(), tokens.end()); + } +} + +template +std::pair, uint64_t> Tiktoken::_encode_with_special_token( + const std::string& text, + const T& allowed_special) { + std::vector tokens; + uint64_t last_piece_token_len = 0; + re2::StringPiece input(text); + while (true) { + auto [special, sub_input] = + _split_with_allowed_special_token(input, allowed_special); + + _encode(sub_input, tokens, last_piece_token_len); + + if (special) { + uint64_t token = 0; + try { + token = _special_token_encoder.at(*special); + } catch (const std::out_of_range&) { + // Should never go here, since special pattern includes all special + // chars. + ET_CHECK_MSG(false, "unknown special token: %s", special->c_str()); + } + + tokens.push_back(token); + last_piece_token_len = 0; + } else { + break; + } + } + + // last_piece_token_len is how many tokens came from the last regex split. + // This is used for determining unstable tokens, since you can't merge + // across (stable) regex splits + return std::make_pair(tokens, last_piece_token_len); +} + +// -------------------------private method end------------------------------- +// -------------------------public method start------------------------------- + +Error Tiktoken::load(const std::string& path) { + _encoder = _load_encoder(path); + _special_token_encoder = _get_special_tokens(_encoder.size()); + + _decoder = _build_decoder(_encoder); + _special_token_decoder = _build_decoder(_special_token_encoder); + + _regex = _create_regex(_pattern); + + _special_token_regex = _build_special_token_regex(_special_token_encoder); + + initialized_ = true; + return Error::Ok; +} + +Result> +Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { + if (!initialized_) { + return Error::NotSupported; + } + auto res = _encode_with_special_token(text, _special_token_encoder).first; + for (auto i = 0; i < bos; ++i) { + res.insert(res.begin(), bos_tok_); + } + for (auto i = 0; i < eos; ++i) { + res.push_back(eos_tok_); + } + return Result(res); +} + +Result Tiktoken::decode(uint64_t prev, uint64_t cur) { + (void)prev; + if (!initialized_) { + return Error::NotSupported; + } + std::string ret; + + std::string token_bytes; + auto iter = _decoder.find(cur); + if (iter != _decoder.end()) { + token_bytes = iter->second; + } else { + iter = _special_token_decoder.find(cur); + if (iter != _special_token_decoder.end()) { + token_bytes = iter->second; + } else { + ET_CHECK_MSG(false, "unknown token: %" PRIu64, cur); + } + } + ret += token_bytes; + + return ret; +} +// -------------------------public method end------------------------------- + +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/tokenizer/tiktoken.h b/examples/models/llama2/tokenizer/tiktoken.h new file mode 100644 index 00000000000..e00efdf99e3 --- /dev/null +++ b/examples/models/llama2/tokenizer/tiktoken.h @@ -0,0 
+1,89 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { + +using Encoder = std::unordered_map; +using Decoder = std::unordered_map; +using Re2UPtr = std::unique_ptr; + +class Tiktoken : public Tokenizer { + public: + explicit Tiktoken(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok) + : Tokenizer(vocab_size, bos_tok, eos_tok){}; + ~Tiktoken(){}; + + Error load(const std::string& tokenizer_path); + + Result> + encode(const std::string& input, int8_t bos, int8_t eos); + + Result decode(uint64_t prev_token, uint64_t token); + + private: + static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { + Encoder special_tokens; + special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); + special_tokens.emplace("<|end_of_text|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); + special_tokens.emplace("<|start_header_id|>", num_base_tokens++); + special_tokens.emplace("<|end_header_id|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); + special_tokens.emplace("<|eot_id|>", num_base_tokens++); + for (auto i = 5; i < 251; ++i) { + special_tokens.emplace( + "<|reserved_special_token_" + std::to_string(i) + "|>", + num_base_tokens++); + } + return special_tokens; + } + + template + std::pair, re2::StringPiece> + _split_with_allowed_special_token( + re2::StringPiece& input, + const T& allowed_special); + + void _encode( + re2::StringPiece& input, + std::vector& ret, + uint64_t& last_piece_token_len); + + template + std::pair, uint64_t> _encode_with_special_token( + const std::string& text, + const T& allowed_special); + + // Removed negative lookahead \s+(?!\S) since it's not supported by RE2. + const std::string _pattern = + R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; + Encoder _encoder; + Encoder _special_token_encoder; + Decoder _decoder; + Decoder _special_token_decoder; + + Re2UPtr _regex; + Re2UPtr _special_token_regex; +}; +} // namespace executor +} // namespace torch diff --git a/examples/models/llama2/tokenizer/tiktoken.py b/examples/models/llama2/tokenizer/tiktoken.py new file mode 100644 index 00000000000..a1f0fde11af --- /dev/null +++ b/examples/models/llama2/tokenizer/tiktoken.py @@ -0,0 +1,233 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
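A minimal pure-Python sketch of the ranked byte-pair merge that _byte_pair_merge in tiktoken.cpp above implements; this simplified version recomputes candidate ranks on every pass instead of caching them, and assumes ranks maps byte strings to merge priority (lower merges first):

def byte_pair_merge(piece: bytes, ranks: dict) -> list:
    # Start from single bytes and greedily merge the adjacent pair with the
    # lowest rank until no adjacent pair is present in `ranks`.
    parts = [bytes([b]) for b in piece]
    while len(parts) > 1:
        candidates = [
            (ranks[parts[i] + parts[i + 1]], i)
            for i in range(len(parts) - 1)
            if parts[i] + parts[i + 1] in ranks
        ]
        if not candidates:
            break
        _, i = min(candidates)
        parts[i : i + 2] = [parts[i] + parts[i + 1]]
    # Unknown pieces fall back to 0, mirroring the C++ fallback branch.
    return [ranks.get(p, 0) for p in parts]

The C++ implementation produces the same result but tracks ranks over an index vector and updates only the neighbors of each merge, which keeps it fast for the short regex-split pieces it operates on.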
+ +import os +from logging import getLogger +from pathlib import Path +from typing import ( + AbstractSet, + cast, + Collection, + Dict, + Iterator, + List, + Literal, + Sequence, + TypedDict, + Union, +) + +import tiktoken +from tiktoken.load import load_tiktoken_bpe + + +logger = getLogger(__name__) + + +Role = Literal["system", "user", "assistant"] + + +class Message(TypedDict): + role: Role + content: str + + +Dialog = Sequence[Message] + + +class Tokenizer: + """ + tokenizing and encoding/decoding text using the Tiktoken tokenizer. + """ + + special_tokens: Dict[str, int] + + num_reserved_special_tokens = 256 + + pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 + + def __init__(self, model_path: str): + """ + Initializes the Tokenizer with a Tiktoken model. + + Args: + model_path (str): The path to the Tiktoken model file. + """ + # reload tokenizer + assert os.path.isfile(model_path), model_path + + mergeable_ranks = load_tiktoken_bpe(model_path) + num_base_tokens = len(mergeable_ranks) + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + [ + f"<|reserved_special_token_{i}|>" + for i in range(5, self.num_reserved_special_tokens - 5) + ] + self.special_tokens = { + token: num_base_tokens + i for i, token in enumerate(special_tokens) + } + self.model = tiktoken.Encoding( + name=Path(model_path).name, + pat_str=self.pat_str, + mergeable_ranks=mergeable_ranks, + special_tokens=self.special_tokens, + ) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.model.n_vocab + self.bos_id: int = self.special_tokens["<|begin_of_text|>"] + self.eos_id: int = self.special_tokens["<|end_of_text|>"] + self.pad_id: int = -1 + self.stop_tokens = { + self.special_tokens["<|end_of_text|>"], + self.special_tokens["<|eot_id|>"], + } + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + + def encode( + self, + s: str, + *, + bos: bool, + eos: bool, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006 + disallowed_special: Union[Literal["all"], Collection[str]] = (), + ) -> List[int]: + """ + Encodes a string into a list of token IDs. + + Args: + s (str): The input string to be encoded. + bos (bool): Whether to prepend the beginning-of-sequence token. + eos (bool): Whether to append the end-of-sequence token. + allowed_tokens ("all"|set[str]): allowed special tokens in string + disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string + + Returns: + list[int]: A list of token IDs. + + By default, setting disallowed_special=() encodes a string by ignoring + special tokens. Specifically: + - Setting `disallowed_special` to () will cause all text corresponding + to special tokens to be encoded as natural text (insteading of raising + an error). + - Setting `allowed_special` to "all" will treat all text corresponding + to special tokens to be encoded as special tokens. 
+ """ + assert type(s) is str + + # The tiktoken tokenizer can handle <=400k chars without + # pyo3_runtime.PanicException (may go beyond 400k) + TIKTOKEN_MAX_ENCODE_CHARS = 400_000 + + # https://github.com/openai/tiktoken/issues/195 + # Here we iterate over subsequences and split if we exceed the limit + # of max consecutive non-whitespace or whitespace characters. + MAX_NO_WHITESPACES_CHARS = 25_000 + + substrs = ( + substr + for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) + for substr in self._split_whitespaces_or_nonwhitespaces( + s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS + ) + ) + t: List[int] = [] + for substr in substrs: + t.extend( + self.model.encode( + substr, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ) + if bos: + t.insert(0, self.bos_id) + if eos: + t.append(self.eos_id) + return t + + def decode(self, t: Sequence[int]) -> str: + """ + Decodes a list of token IDs into a string. + + Args: + t (List[int]): The list of token IDs to be decoded. + + Returns: + str: The decoded string. + """ + # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence. + return self.model.decode(cast(List[int], t)) + + @staticmethod + def _split_whitespaces_or_nonwhitespaces( + s: str, max_consecutive_slice_len: int + ) -> Iterator[str]: + """ + Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces + """ + current_slice_len = 0 + current_slice_is_space = s[0].isspace() if len(s) > 0 else False + slice_start = 0 + + for i in range(len(s)): + is_now_space = s[i].isspace() + + if current_slice_is_space ^ is_now_space: + current_slice_len = 1 + current_slice_is_space = is_now_space + else: + current_slice_len += 1 + if current_slice_len > max_consecutive_slice_len: + yield s[slice_start:i] + slice_start = i + current_slice_len = 1 + yield s[slice_start:] + + +class ChatFormat: + def __init__(self, tokenizer: Tokenizer): + self.tokenizer = tokenizer + + def encode_header(self, message: Message) -> List[int]: + tokens = [] + tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) + tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False)) + tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"]) + tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) + return tokens + + def encode_message(self, message: Message) -> List[int]: + tokens = self.encode_header(message) + tokens.extend( + self.tokenizer.encode(message["content"].strip(), bos=False, eos=False) + ) + tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) + return tokens + + def encode_dialog_prompt(self, dialog: Dialog) -> List[int]: + tokens = [] + tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) + for message in dialog: + tokens.extend(self.encode_message(message)) + # Add the start of an assistant message for the model to complete + tokens.extend(self.encode_header({"role": "assistant", "content": ""})) + return tokens diff --git a/examples/models/llama2/tokenizer/tokenizer.h b/examples/models/llama2/tokenizer/tokenizer.h index 0edc4671b17..5e9f0925823 100644 --- a/examples/models/llama2/tokenizer/tokenizer.h +++ b/examples/models/llama2/tokenizer/tokenizer.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,49 +26,39 @@ namespace torch { namespace executor { -struct TokenIndex { - const char* str; - int32_t id; -}; - class Tokenizer { public: - explicit 
Tokenizer(int32_t vocab_size, int32_t bos_tok, int32_t eos_tok); - ~Tokenizer(); + explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok) + : initialized_(false), + vocab_size_(vocab_size), + bos_tok_(bos_tok), + eos_tok_(eos_tok) {} + virtual ~Tokenizer() {} - Error load(const std::string& tokenizer_path); + virtual Error load(const std::string& tokenizer_path) = 0; - Error encode( - const char* text, - int8_t bos, - int8_t eos, - int32_t* tokens, - int32_t* n_tokens); + virtual Result> + encode(const std::string& input, int8_t bos, int8_t eos) = 0; - Result decode(int prev_token, int token); + virtual Result decode(uint64_t prev_token, uint64_t token) = 0; // getters int32_t vocab_size() const { return vocab_size_; } - int32_t bos_tok() const { + uint64_t bos_tok() const { return bos_tok_; } - int32_t eos_tok() const { + uint64_t eos_tok() const { return eos_tok_; } - private: + protected: bool initialized_; const int32_t vocab_size_; - int32_t bos_tok_, eos_tok_; - std::unique_ptr vocab_; - std::unique_ptr vocab_scores_; - std::unique_ptr sorted_vocab_; - unsigned int max_token_length_; - unsigned char byte_pieces_[512]; // stores all single-byte strings + uint64_t bos_tok_, eos_tok_; }; } // namespace executor diff --git a/examples/models/llama3/README.md b/examples/models/llama3/README.md new file mode 100644 index 00000000000..5ea3e6b9e1e --- /dev/null +++ b/examples/models/llama3/README.md @@ -0,0 +1,2 @@ +# Summary +For Llama3, use the same example code, minus tokenizer, as Llama2. Please see the [Llama2 README page](../llama2/README.md) for details. diff --git a/examples/portable/scripts/test_demo_backend_delegation.sh b/examples/portable/scripts/test_demo_backend_delegation.sh index 7044026d601..d1ecf9150f9 100644 --- a/examples/portable/scripts/test_demo_backend_delegation.sh +++ b/examples/portable/scripts/test_demo_backend_delegation.sh @@ -22,8 +22,7 @@ build_cmake_executor_runner() { (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DBUCK2=buck2 \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + && retry cmake -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) cmake --build ${CMAKE_OUTPUT_DIR} -j4 } diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index b2691da2ec7..8998ee634e0 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -202,8 +202,10 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. // torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + // TODO: So far we have issues with etdump_gen during load_method. Enable it + // after the issues are fixed. Result method = - program->load_method(method_name, &memory_manager, &etdump_gen); + program->load_method(method_name, &memory_manager, nullptr); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py new file mode 100644 index 00000000000..936db49d0a1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -0,0 +1,281 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
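+ # Example script: export SSD300 (VGG16 backbone, 21 classes) through the
+ # Qualcomm backend utilities and evaluate it on the Pascal VOC2007 test split
+ # prepared by create_data_lists/get_dataset below.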
+ +import json +import os +import sys +from multiprocessing.connection import Client +from pprint import PrettyPrinter + +import numpy as np +import torch + +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.examples.qualcomm.scripts.utils import ( + build_executorch_binary, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) + + +def create_data_lists(voc07_path, data_size): + """ + Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file. + + :param voc07_path: path to the 'VOC2007' folder + :param output_folder: folder where the JSONs must be saved + """ + from utils import parse_annotation + + voc07_path = os.path.abspath(voc07_path) + + # Test data + test_images = [] + test_objects = [] + n_objects = 0 + + # Find IDs of images in the test data + with open(os.path.join(voc07_path, "ImageSets/Main/test.txt")) as f: + ids = f.read().splitlines() + + for index, id in enumerate(ids): + if index >= data_size: + break + # Parse annotation's XML file + objects = parse_annotation(os.path.join(voc07_path, "Annotations", id + ".xml")) + if len(objects) == 0: + continue + test_objects.append(objects) + n_objects += len(objects) + test_images.append(os.path.join(voc07_path, "JPEGImages", id + ".jpg")) + + assert len(test_objects) == len(test_images) + + # TEST_images.json stores the file name of the images, and TEST_objects.json stores info such as boxes, labels, and difficulties + with open(os.path.join(voc07_path, "TEST_images.json"), "w") as j: + json.dump(test_images, j) + with open(os.path.join(voc07_path, "TEST_objects.json"), "w") as j: + json.dump(test_objects, j) + + print( + "\nThere are %d test images containing a total of %d objects. Files have been saved to %s." 
+ % (len(test_images), n_objects, os.path.abspath(voc07_path)) + ) + + +def get_dataset(data_size, dataset_dir, download): + from datasets import PascalVOCDataset + from torchvision import datasets + + if download: + datasets.VOCSegmentation( + root=os.path.join(dataset_dir, "voc_image"), + year="2007", + image_set="test", + download=True, + ) + voc07_path = os.path.join(dataset_dir, "voc_image", "VOCdevkit", "VOC2007") + create_data_lists(voc07_path, data_size) + + # voc07_path is where the data and ground truth json file will be stored + test_dataset = PascalVOCDataset(voc07_path, split="test", keep_difficult=True) + + test_loader = torch.utils.data.DataLoader( + test_dataset, shuffle=True, collate_fn=test_dataset.collate_fn + ) + + inputs, input_list = [], "" + true_boxes = [] + true_labels = [] + true_difficulties = [] + for index, (images, boxes, labels, difficulties) in enumerate(test_loader): + if index >= data_size: + break + inputs.append((images,)) + input_list += f"input_{index}_0.raw\n" + true_boxes.extend(boxes) + true_labels.extend(labels) + true_difficulties.extend(difficulties) + + return inputs, input_list, true_boxes, true_labels, true_difficulties + + +def SSD300VGG16(pretrained_weight_model): + from model import SSD300 + + model = SSD300(n_classes=21) + # TODO: If possible, it's better to set weights_only to True + # https://pytorch.org/docs/stable/generated/torch.load.html + checkpoint = torch.load( + pretrained_weight_model, map_location="cpu", weights_only=False + ) + model.load_state_dict(checkpoint["model"].state_dict()) + + return model.eval() + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./ssd300_vgg16", + default="./ssd300_vgg16", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + parser.add_argument( + "--oss_repo", + help=( + "Repository that contains model backbone and score calculation." + "e.g., --M ./a-PyTorch-Tutorial-to-Object-Detection" + "Please clone the repository from https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help=( + "Location of model pretrained weight." + "e.g., -p ./checkpoint_ssd300.pth.tar" + "Pretrained model can be found in the link https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection, under the Training Section" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + + sys.path.insert(0, args.oss_repo) + + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." 
+ ) + + data_num = 100 + inputs, input_list, true_boxes, true_labels, true_difficulties = get_dataset( + data_size=data_num, dataset_dir=args.artifact, download=args.download + ) + + pte_filename = "ssd300_vgg16_qnn" + model = SSD300VGG16(args.pretrained_weight) + + sample_input = (torch.randn((1, 3, 300, 300)),) + build_executorch_binary( + model, + sample_input, + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_8a8w, + ) + + if args.compile_only: + sys.exit(0) + + # setup required paths accordingly + # qnn_sdk : QNN SDK path setup in environment variable + # artifact_path : path where artifacts were built + # pte_path : path where executorch binary was stored + # device_id : serial number of android device + # workspace : folder for storing artifacts on android device + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + artifact_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + det_boxes = [] + det_labels = [] + det_scores = [] + + def post_process(): + from utils import calculate_mAP + + np.set_printoptions(threshold=np.inf) + + # output_xxx_0.raw is output of boxes, and output_xxx_1.raw is output of classes + for file_index in range(data_num): + boxes_filename = os.path.join( + output_data_folder, f"output_{file_index}_0.raw" + ) + category_filename = os.path.join( + output_data_folder, f"output_{file_index}_1.raw" + ) + + predicted_locs = np.fromfile(boxes_filename, dtype=np.float32).reshape( + [1, 8732, 4] + ) + predicted_locs = torch.tensor(predicted_locs) + + predicted_scores = np.fromfile(category_filename, dtype=np.float32).reshape( + [1, 8732, 21] + ) + predicted_scores = torch.tensor(predicted_scores) + + det_boxes_batch, det_labels_batch, det_scores_batch = model.detect_objects( + predicted_locs, + predicted_scores, + min_score=0.01, + max_overlap=0.45, + top_k=200, + ) + + det_boxes.extend(det_boxes_batch) + det_labels.extend(det_labels_batch) + det_scores.extend(det_scores_batch) + + pp = PrettyPrinter() + # Calculate mAP + APs, mAP = calculate_mAP( + det_boxes, + det_labels, + det_scores, + true_boxes, + true_labels, + true_difficulties, + ) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"mAP": float(mAP)})) + else: + print("\nMean Average Precision (mAP): %.3f" % mAP) + pp.pprint(APs) + + adb.pull(output_path=args.artifact, callback=post_process) diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index cdb84f6e8c6..a6d2e6d1a3e 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -40,6 +40,14 @@ help="Generate ETRecord metadata to link with runtime results (used for profiling)", ) + parser.add_argument( + "-f", + "--output_folder", + type=str, + default="", + help="The folder to store the exported program", + ) + args = parser.parse_args() if args.model_name not in MODEL_NAME_TO_MODEL: @@ -92,7 +100,7 @@ ) if args.generate_etrecord: - etrecord_path = "etrecord.bin" + etrecord_path = args.output_folder + "etrecord.bin" generate_etrecord(etrecord_path, 
edge_copy, executorch_program) - save_pte_program(executorch_program, args.model_name) + save_pte_program(executorch_program, args.model_name, args.output_folder) diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 84d130d4244..cb067690f94 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -204,6 +204,8 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): ) model.load_state_dict( + # TODO: If possible, it's better to set weights_only to True + # https://pytorch.org/docs/stable/generated/torch.load.html torch.load( ( f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model" @@ -211,6 +213,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): else pretrained_weight ), map_location=torch.device("cpu"), + weights_only=False, ), ) diff --git a/examples/sdk/CMakeLists.txt b/examples/sdk/CMakeLists.txt index d7ca7679e31..ec65bef8f55 100644 --- a/examples/sdk/CMakeLists.txt +++ b/examples/sdk/CMakeLists.txt @@ -38,6 +38,9 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) # Find prebuilt libraries. executorch package should contain # portable_ops_lib, etdump, bundled_program. find_package(executorch CONFIG REQUIRED) +target_link_options_shared_lib(executorch) +target_link_options_shared_lib(portable_ops_lib) + target_include_directories(executorch INTERFACE ${_common_include_directories}) find_package( @@ -48,18 +51,6 @@ add_executable(sdk_example_runner sdk_example_runner/sdk_example_runner.cpp) target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) -# portable_ops_lib -gen_selected_ops("" "" "ON") -# Expect gen_selected_ops output file to be selected_operators.yaml -generate_bindings_for_kernels( - FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml -) -gen_operators_lib( - "portable_ops_lib" - KERNEL_LIBS portable_kernels - DEPS executorch) - -target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) target_include_directories( etdump INTERFACE @@ -72,6 +63,8 @@ target_link_libraries( gflags etdump extension_data_loader - flatcc bundled_program - portable_ops_lib) + flatccrt + portable_ops_lib + portable_kernels +) diff --git a/examples/sdk/README.md b/examples/sdk/README.md index 8d42b6f6037..735776be877 100644 --- a/examples/sdk/README.md +++ b/examples/sdk/README.md @@ -56,7 +56,7 @@ Running the program will generate an `ETDump` file (`.etdp`) at the location spe ```bash cd executorch - rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DBUCK2=buck2 -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. + rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. cd .. 
cmake --build cmake-out -j8 -t sdk_example_runner ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path mv2_bundled.bpte --etdump_path mv2_etdump.etdp diff --git a/examples/sdk/test_sdk_example_runner.sh b/examples/sdk/test_sdk_example_runner.sh index 2f1044f42cc..5185def6552 100644 --- a/examples/sdk/test_sdk_example_runner.sh +++ b/examples/sdk/test_sdk_example_runner.sh @@ -18,8 +18,7 @@ cmake_install_executorch_sdk_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out - retry cmake -DBUCK2="$BUCK" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ + retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ diff --git a/examples/selective_build/CMakeLists.txt b/examples/selective_build/CMakeLists.txt index 29791187185..247a269d94b 100644 --- a/examples/selective_build/CMakeLists.txt +++ b/examples/selective_build/CMakeLists.txt @@ -115,10 +115,14 @@ list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") # link to # add_executable(selective_build_test ${_executor_runner__srcs}) -if(CMAKE_BUILD_TYPE EQUAL "RELEASE") +if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(selective_build_test PRIVATE "LINKER:--gc-sections") endif() -target_link_libraries(selective_build_test executorch gflags select_build_lib) +target_link_libraries( + selective_build_test PRIVATE executorch gflags select_build_lib +) +target_link_options_shared_lib(select_build_lib) +target_link_options_shared_lib(executorch) target_compile_options(selective_build_test PUBLIC ${_common_compile_options}) # Print all summary diff --git a/examples/xtensa/aot/export_example.py b/examples/xtensa/aot/export_example.py deleted file mode 100644 index b51f5c9b498..00000000000 --- a/examples/xtensa/aot/export_example.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# Example script for exporting simple models to flatbuffer - -import logging - -from .meta_registrations import * # noqa - -import torch -from executorch.exir import EdgeCompileConfig -from torch._export import capture_pre_autograd_graph -from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e - -from ...portable.utils import export_to_edge, save_pte_program - -from .quantizer import ( - QuantFusion, - ReplacePT2DequantWithXtensaDequant, - ReplacePT2QuantWithXtensaQuant, - XtensaQuantizer, -) - - -FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" -logging.basicConfig(level=logging.INFO, format=FORMAT) - - -if __name__ == "__main__": - in_features = 32 - out_features = 16 - bias = True - shape = [64, in_features] - - class QuantizedLinear(torch.nn.Module): - def __init__(self, in_features: int, out_features: int, bias: bool): - super().__init__() - self.output_linear = torch.nn.Linear(in_features, out_features, bias=bias) - - def forward(self, x: torch.Tensor): - output_linear_out = self.output_linear(x) - return output_linear_out - - model = QuantizedLinear(in_features, out_features, bias) - model.eval() - - example_inputs = (torch.ones(shape),) - - # Quantizer - quantizer = XtensaQuantizer() - - # Export - model_exp = capture_pre_autograd_graph(model, example_inputs) - - # Prepare - prepared_model = prepare_pt2e(model_exp, quantizer) - prepared_model(*example_inputs) - - # Convert - converted_model = convert_pt2e(prepared_model) - - # pyre-fixme[16]: Pyre doesn't get that XtensaQuantizer has a patterns attribute - patterns = [q.pattern for q in quantizer.quantizers] - QuantFusion(patterns)(converted_model) - - # pre-autograd export. eventually this will become torch.export - converted_model_exp = capture_pre_autograd_graph(converted_model, example_inputs) - - converted_model_exp = torch.ao.quantization.move_exported_model_to_eval( - converted_model_exp - ) - - exec_prog = ( - export_to_edge( - converted_model_exp, - example_inputs, - edge_compile_config=EdgeCompileConfig( - _check_ir_validity=False, - ), - ) - .transform( - [ReplacePT2QuantWithXtensaQuant(), ReplacePT2DequantWithXtensaDequant()], - check_ir_validity=False, - ) - .to_executorch() - ) - - logging.info(f"Final exported graph:\n{exec_prog.exported_program().graph}") - - # Save the program as XtensaDemoModel.pte - save_pte_program(exec_prog, "XtensaDemoModel") diff --git a/examples/xtensa/aot/meta_registrations.py b/examples/xtensa/aot/meta_registrations.py deleted file mode 100644 index aa6014dc9cf..00000000000 --- a/examples/xtensa/aot/meta_registrations.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library - -lib = Library("xtensa", "DEF") - -lib.define( - "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" -) -lib.define( - "quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" -) - -lib.define( - "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" -) -lib.define( - "dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" -) - -lib.define( - "quantized_linear_pt2(Tensor src, Tensor weight, Tensor bias, float src_scale, int src_zero_point, float weight_scale, int weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point) -> (Tensor Z)" -) -lib.define( - "quantized_linear_pt2.out(Tensor src, Tensor weight, Tensor bias, float src_scale, int src_zero_point, float weight_scale, int weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)" -) - -m = Library("xtensa", "IMPL", "Meta") - - -@impl(m, "quantize_per_tensor") -def quantize_per_tensor_meta( - input: torch.Tensor, - scale: float, - zero_point: int, - quant_min: int, - quant_max: int, - dtype: ScalarType, -): - return input.new_empty(input.size(), dtype=dtype) - - -@impl(m, "dequantize_per_tensor") -def dequantize_per_tensor_meta( - input: torch.Tensor, - scale: float, - zero_point: int, - quant_min: int, - quant_max: int, - dtype: ScalarType, -): - return input.new_empty(input.size(), dtype=torch.float) - - -@impl(m, "quantized_linear_pt2") -def quantized_linear_pt2_meta( - src: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - in_scale: float, - in_zero_point: int, - weight_scale: float, - weight_zero_point: int, - out_multiplier: int, - out_shift: int, - out_zero_point: int, -): - # src comes in shape [leading_dims, in_dim] - # weight comes in shape [out_dim, in_dim] - # output comes in empty with shape [leading_dims, out_dim] - out_size = list(src.size()) - weight_size = list(weight.size()) - assert len(weight_size) == 2 - out_size[-1] = weight_size[0] - return src.new_empty(out_size, dtype=torch.uint8) diff --git a/examples/xtensa/aot/quantizer.py b/examples/xtensa/aot/quantizer.py deleted file mode 100644 index 618d374853f..00000000000 --- a/examples/xtensa/aot/quantizer.py +++ /dev/null @@ -1,443 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from math import frexp, isclose, trunc -from typing import Any, Callable, List, Optional, Tuple, Type - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass - -from torch import fx - -from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver -from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions -from torch.ao.quantization.quantizer import Quantizer -from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer -from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( - OperatorConfig, - QuantizationAnnotation, - QuantizationConfig, - QuantizationSpec, -) -from torch.fx import GraphModule -from torch.fx.passes.infra.pass_base import PassResult -from torch.fx.passes.utils.fuser_utils import legalize_graph - -# torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib") - - -def quantize_tensor_multiplier( - requantize_scale_tensor: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Given requantize_scale_tensor with values in the interval (0, 1), - produce a pair of tensors (out_multiplier, right_shift) where out_multiplier - is an int32 tensor representing fixed-point values in the interval [-1, 1), - and right_shift is an amount to shift right by, so that the floating-point - multiplication of some int32 input with each value of requantize_scale_tensor: - result = int32_value * requantize_scale_tensors[i] - is best approximated by the integer-arithmetic-only code: - result = RoundingRightShift(FixedPointMultiplication(int32_value, - out_multiplier[i]), right_shift[i]) - """ - - # This is identical to C++11 std::round(). The general python round rounds - # down, and C++ rounds away from zero. - def round_away_zero(f) -> int: - r = -0.5 if (f < 0) else 0.5 - return trunc(f + r) - - def quantize_scalar_multiplier(requantize_scale: float) -> Tuple[int, int]: - significand, exponent = frexp(requantize_scale) - significand_q31 = int(round_away_zero(significand * (1 << 31))) - # Handle the special case when the real multiplier was so close to 1 - # that its fixed-point approximation was indistinguishable from 1. - # We handle this by dividing it by two, incrementing exponent by 1. - # the right shift amount. - if significand_q31 == (1 << 31): - significand_q31 //= 2 - exponent += 1 - - # Verify that the decomposition of requantize_scale into significand - # and exponent is correct. 
- reconstructed = significand_q31 / (1 << 31) * pow(2, exponent) - assert isclose( - requantize_scale, reconstructed, rel_tol=1e-4, abs_tol=1e-4 - ), "computation of significand and exponent from requantize_scale is not accurate" - - return (significand_q31, exponent) - - # Flatten the input scale tensor so that we can operate on individual values - orig_shape = requantize_scale_tensor.shape - flattened_tensor = requantize_scale_tensor.flatten().to(torch.float32) - out_multiplier = torch.zeros(flattened_tensor.shape, dtype=torch.int32) - right_shift = torch.zeros(flattened_tensor.shape, dtype=torch.int32) - - # Iterate over the flattened scale tensor and compute the decomposition of - # each value in scale tensor into significand(out_multiplier) and - # exponent(right_shift) - for idx, scale in enumerate(flattened_tensor): - (si, ex) = quantize_scalar_multiplier(scale) - out_multiplier[idx], right_shift[idx] = si, ex - - # Reshape the tensors back to the original shape - out_multiplier = out_multiplier.reshape(orig_shape) - right_shift = right_shift.reshape(orig_shape) - - return (out_multiplier, right_shift) - - -def _is_annotated(nodes: List[fx.Node]) -> bool: - annotated = False - for node in nodes: - annotated = annotated or ( - "quantization_annotation" in node.meta - and node.meta["quantization_annotation"]._annotated - ) - return annotated - - -def _no_outside_users(fused_partition) -> bool: - """ - Checks if each partition other than the last does not have any outside users. - """ - for source_partition in fused_partition[:-1]: - if len(source_partition.output_nodes) != 1: - return False - if len(source_partition.output_nodes[0].users) != 1: - return False - return True - - -@dataclass -class PartitionAnchors: - """ - All fields except output are lists of (node, args_index) pair, where node is from - the given partition and node.args[args_index] is an input to the partition. Assumes - a single output. - - Quantizer uses inputs, weights and biases for quantization annotation. The others - field contains tensor inputs that aren't quantized, and the literals fields contains - is used for other types of input values as well as handling default parameters. - """ - - inputs: List[Tuple[fx.Node, int]] = field(default_factory=list) - weights: List[Tuple[fx.Node, int]] = field(default_factory=list) - biases: List[Tuple[fx.Node, int]] = field(default_factory=list) - others: List[Tuple[fx.Node, int]] = field(default_factory=list) - literals: List[Tuple[fx.Node, int]] = field(default_factory=list) - output: Optional[fx.Node] = None - - -class QuantizationPattern(ABC): - @abstractmethod - def partition_types(self) -> List[Any]: - """ - List of types to be passed to find_sequential_partitions. - """ - pass - - @abstractmethod - def get_anchors(self, gm, fused_partition) -> Optional[PartitionAnchors]: - pass - - @abstractmethod - def replacement_op(self) -> Callable[..., Any]: - """ - Operator (most likely a custom one) that this partition should be fused into in - the backend. Refer to the QuantFusion pass for examples. 
- """ - pass - - -class LinearPattern(QuantizationPattern): - def partition_types(self) -> List[Type[torch.nn.Module]]: - return [torch.nn.Linear] - - def get_anchors( - self, gm: GraphModule, fused_partition: List[GraphModule] - ) -> PartitionAnchors: - linear_node = fused_partition[0].nodes[-1] - - # Keep bias empty if not supplied - bias = [] - if len(linear_node.args) > 2: - bias = [(linear_node, 2)] - - return PartitionAnchors( - inputs=[(linear_node, 0)], - weights=[(linear_node, 1)], - biases=bias, - output=linear_node, - ) - - def replacement_op(self): - return torch.ops.xtensa.quantized_linear_pt2.default - - -class GenericQuantizer(Quantizer): - def __init__(self, pattern, quantization_config): - super().__init__() - self.pattern = pattern - self.quantization_config = quantization_config - - def annotate(self, model): - fused_partitions = find_sequential_partitions( - model, - self.pattern.partition_types(), - ) - - input_act_qspec = self.quantization_config.input_activation - weight_qspec = self.quantization_config.weight - bias_qspec = self.quantization_config.bias - output_act_qspec = self.quantization_config.output_activation - - for fused_partition in fused_partitions: - if not _no_outside_users(fused_partition): - continue - - anchors = self.pattern.get_anchors(model, fused_partition) - if not anchors: - continue - if _is_annotated( - [x[0] for x in anchors.inputs + anchors.weights + anchors.biases] - + [anchors.output] - ): - continue - - anchors.output.meta["quantization_annotation"] = QuantizationAnnotation( - output_qspec=output_act_qspec, - _annotated=True, - ) - - def annotate_inputs(inputs, spec): - for node, idx in inputs: - annotation = node.meta.get( - "quantization_annotation", - QuantizationAnnotation(_annotated=True), - ) - annotation.input_qspec_map[node.args[idx]] = spec - node.meta["quantization_annotation"] = annotation - - annotate_inputs(anchors.inputs, input_act_qspec) - annotate_inputs(anchors.weights, weight_qspec) - annotate_inputs(anchors.biases, bias_qspec) - - def validate(self, model: fx.GraphModule) -> None: - pass - - @classmethod - def get_supported_operators(cls) -> List[OperatorConfig]: - return [] - - -act_qspec = QuantizationSpec( - dtype=torch.uint8, - quant_min=0, - quant_max=255, - qscheme=torch.per_tensor_affine, - is_dynamic=False, - observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), -) - -wgt_qspec = QuantizationSpec( - dtype=torch.uint8, - quant_min=0, - quant_max=255, - qscheme=torch.per_tensor_affine, - is_dynamic=False, - observer_or_fake_quant_ctr=MinMaxObserver, -) - - -class XtensaQuantizer(ComposableQuantizer): - def __init__(self): - static_qconfig = QuantizationConfig( - act_qspec, - act_qspec, - wgt_qspec, - None, - ) - super().__init__( - [ - GenericQuantizer(LinearPattern(), static_qconfig), - ] - ) - - -class QuantFusion(ExportPass): - def __init__(self, patterns): - super().__init__() - self.patterns = patterns - - def call(self, graph_module: fx.GraphModule) -> PassResult: - for pattern in self.patterns: - fused_partitions = find_sequential_partitions( - graph_module, - pattern.partition_types(), - ) - for fused_partition in fused_partitions: - anchors = pattern.get_anchors(graph_module, fused_partition) - if not anchors: - continue - if any(self.is_fused(p.nodes) for p in fused_partition): - continue - - for p in fused_partition: - self.mark_fused(p.nodes) - - dequants_inputs = [] - for node, idx in anchors.inputs: - if ( - node.args[idx].target - == 
torch.ops.quantized_decomposed.dequantize_per_tensor.default - ): - dequants_inputs.append(node.args[idx]) - dequants_weights = [] - for node, idx in anchors.weights: - if ( - node.args[idx].target - == torch.ops.quantized_decomposed.dequantize_per_tensor.default - ): - dequants_weights.append(node.args[idx]) - - inputs_inputs = [node.args[0] for node in dequants_inputs] - weights_inputs = [node.args[0] for node in dequants_weights] - bias_inputs = [node.args[idx] for node, idx in anchors.biases] - other_inputs = [node.args[idx] for node, idx in anchors.others] - - assert len(anchors.output.users) == 1 - quant_node = list(anchors.output.users.keys())[0] - - with graph_module.graph.inserting_after(anchors.output): - args = tuple( - inputs_inputs + weights_inputs + other_inputs + bias_inputs - ) - kwargs = {} - if ( - pattern.replacement_op() - == torch.ops.xtensa.quantized_linear_pt2.default - ): - weight_scale = ( - weights_inputs[0].args[1] - if weights_inputs[0].name[:13] != "_frozen_param" - else dequants_weights[0].args[1] - ) - bias_scale = inputs_inputs[0].args[1] * weight_scale - requantize_scale = bias_scale / quant_node.args[1] - requantize_scale_t = torch.tensor([requantize_scale]) - - (out_multiplier, out_shift) = quantize_tensor_multiplier( - requantize_scale_t - ) - bias_shape = weights_inputs - node = ( - weights_inputs[0].args[0] - if weights_inputs[0].name[:13] != "_frozen_param" - else dequants_weights[0].args[0] - ) - attr_node = getattr(graph_module, node.target) - weight_shape = list(attr_node.shape) - bias_shape = weight_shape[0] - bias = ( - bias_inputs[0] - if bias_inputs - else graph_module.graph.call_function( - torch.ops.aten.full.default, ([bias_shape], 0.0) - ) - ) - bias_int32_quant = graph_module.graph.call_function( - torch.ops.quantized_decomposed.quantize_per_tensor.default, - ( - bias, - bias_scale, - 0, - -(2**31), - (2**31) - 1, - torch.int32, - ), - ) - - out_multiplier_ = graph_module.graph.call_function( - torch.ops.aten.full.default, ([1], out_multiplier[0].item()) - ) - out_shift_ = graph_module.graph.call_function( - torch.ops.aten.full.default, ([1], out_shift[0].item()) - ) - args = tuple( - inputs_inputs - + weights_inputs - + other_inputs - + [bias_int32_quant] - ) - kwargs = { - "src_scale": dequants_inputs[0].args[1], - "src_zero_point": dequants_inputs[0].args[2], - "weight_scale": dequants_weights[0].args[1], - "weight_zero_point": dequants_weights[0].args[2], - "out_multiplier": out_multiplier_, - "out_shift": out_shift_, - "out_zero_point": quant_node.args[2], - } - fused = graph_module.graph.call_function( - pattern.replacement_op(), - args, - kwargs, - ) - fused.meta = quant_node.meta - quant_node.replace_all_uses_with(fused) - - legalize_graph(graph_module) - graph_module.graph.eliminate_dead_code() - # pyre-fixme[7]: Incompatible return type - graph_module.recompile() - - @classmethod - def is_fused(cls, nodes) -> bool: - return any(cls.__qualname__ in n.meta for n in nodes) - - @classmethod - def mark_fused(cls, nodes) -> bool: - for n in nodes: - # pyre-fixme[7]: Incompatible return type - n.meta["QuantFusion"] = True - - -class ReplacePT2QuantWithXtensaQuant(ExportPass): - """ - Replace the pt2 quantization ops with custom xtensa quantization ops. 
- """ - - def call_operator(self, op, args, kwargs, meta): - if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}: - return super().call_operator(op, args, kwargs, meta) - - return super().call_operator( - exir_ops.edge.xtensa.quantize_per_tensor.default, - args, - kwargs, - meta, - ) - - -class ReplacePT2DequantWithXtensaDequant(ExportPass): - """ - Replace the pt2 dequantization ops with custom xtensa dequantization ops. - """ - - def call_operator(self, op, args, kwargs, meta): - if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}: - return super().call_operator(op, args, kwargs, meta) - - return super().call_operator( - exir_ops.edge.xtensa.dequantize_per_tensor.default, - args, - kwargs, - meta, - ) diff --git a/examples/xtensa/ops/functions.yaml b/examples/xtensa/ops/functions.yaml deleted file mode 100644 index 07093d3ed24..00000000000 --- a/examples/xtensa/ops/functions.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that are also defined by the ATen library. -# For lean mode: -# - Codegen'd target `executorch_generated_lib` will be reading all the information -# from this file, including operator schema and kernel metadata. -# - Selective build target `codegen:executorch_defined_ops` now is selecting all the -# operators in this file, by dumping all the op names into `selected_operators.yaml`. -# -# See the README.md file in executorch/kernels/portable for a description of the syntax used -# by this file. - - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -- func: xtensa::quantized_linear_pt2.out(Tensor src, Tensor weight, Tensor bias, float src_scale, int src_zero_point, float weight_scale, int weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_linear_pt2_out - -- func: xtensa::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: impl::HiFi::dequantize_per_tensor_out - -- func: xtensa::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
- variants: function - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantize_per_tensor_out diff --git a/exir/TARGETS b/exir/TARGETS index cc20f766bad..170a22f1328 100644 --- a/exir/TARGETS +++ b/exir/TARGETS @@ -144,6 +144,7 @@ python_library( "//executorch/exir/capture:lib", "//executorch/exir/emit:lib", "//executorch/exir/program:lib", + "//executorch/exir/serde:serialize", ], ) diff --git a/exir/__init__.py b/exir/__init__.py index d71a2be064b..c6a1939d357 100644 --- a/exir/__init__.py +++ b/exir/__init__.py @@ -24,6 +24,7 @@ ExirExportedProgram, to_edge, ) +from executorch.exir.serde.serialize import load, save from executorch.exir.tracer import ExirDynamoConfig from torch.export import ExportedProgram, ExportGraphSignature @@ -49,4 +50,6 @@ "ExecutorchBackendConfig", "Value", "ExirDynamoConfig", + "load", + "save", ] diff --git a/exir/backend/test/demos/rpc/test_rpc.py b/exir/backend/test/demos/rpc/test_rpc.py index 0c0e72862fd..63feb954fee 100644 --- a/exir/backend/test/demos/rpc/test_rpc.py +++ b/exir/backend/test/demos/rpc/test_rpc.py @@ -8,6 +8,7 @@ import torch from executorch import exir +from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.test.demos.rpc.executor_backend_partitioner import ( ExecutorBackendPartitioner, @@ -20,6 +21,7 @@ from executorch.extension.pybindings.portable_lib import ( # @manual _load_for_executorch_from_buffer, ) +from torch.export import export from torch.utils._pytree import tree_flatten """ @@ -101,16 +103,15 @@ def test_delegate_whole_program(self): simple_net = self.get_a_simple_net() simple_net_input = simple_net.get_example_inputs() - exported_program = exir.capture( - simple_net, simple_net_input, exir.CaptureConfig() - ).to_edge( - exir.EdgeCompileConfig( + exported_program = to_edge( + export(simple_net, simple_net_input), + compile_config=exir.EdgeCompileConfig( _check_ir_validity=False, - ) + ), ) # delegate the whole graph to the client executor lowered_module = to_backend( - ExecutorBackend.__name__, exported_program.exported_program, [] + ExecutorBackend.__name__, exported_program.exported_program(), [] ) class CompositeModule(torch.nn.Module): @@ -123,11 +124,7 @@ def forward(self, *args): composite_model = CompositeModule() - exec_prog = ( - exir.capture(composite_model, simple_net_input, exir.CaptureConfig()) - .to_edge() - .to_executorch() - ) + exec_prog = to_edge(export(composite_model, simple_net_input)).to_executorch() executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer) @@ -162,18 +159,14 @@ def forward(self, a, x, b): model = Model() inputs = (torch.ones(2, 2), torch.ones(2, 2), torch.ones(2, 2)) - exported_program = exir.capture(model, inputs, exir.CaptureConfig()).to_edge() + exported_program = to_edge(export(model, inputs)) # First lower to demo backend - demo_backend_lowered = exported_program - demo_backend_lowered.exported_program = to_backend( - exported_program.exported_program, AddMulPartitionerDemo() - ) + demo_backend_lowered = exported_program.to_backend(AddMulPartitionerDemo()) # Then lower to executor backend - executor_backend_lowered = demo_backend_lowered - executor_backend_lowered.exported_program = to_backend( - demo_backend_lowered.exported_program, ExecutorBackendPartitioner() + executor_backend_lowered = demo_backend_lowered.to_backend( + ExecutorBackendPartitioner() ) prog_buffer = executor_backend_lowered.to_executorch() diff --git a/exir/backend/test/test_partitioner.py 
b/exir/backend/test/test_partitioner.py index 74974d16231..d492c291f34 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -26,7 +26,7 @@ from executorch.exir.backend.test.demos.rpc.executor_backend_preprocess import ( ExecutorBackend, ) -from executorch.exir.backend.utils import get_delegates +from executorch.exir.backend.utils import get_delegates, tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops @@ -523,3 +523,85 @@ def partition( "constant data node (b_const) is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)", str(error.exception), ) + + def test_not_delegate_mutable_buffers(self) -> None: + """ + A test case to check the mutated buffer is not delegated. We'll need to add a test case + to consider when the delegate can consume the mutable buffer. + """ + + class MutableStateModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("my_state", torch.zeros(1)) + + def forward(self, x): + y = x + self.my_state + self.my_state.add_(1) + return y + + edge = exir.to_edge( + torch.export.export( + MutableStateModule(), + (torch.zeros(1),), + ) + ) + self.assertGreater( + len(edge.exported_program().graph_signature.buffers_to_mutate), + 0, + "The test case should at leaset one mutable buffer", + ) + + class PartitionerTagData(Partitioner): + def __init__(self): + super().__init__() + self.delegation_spec = DelegationSpec( + ExecutorBackend.__name__, + [CompileSpec(key, value) for key, value in self.spec.items()], + ) + + def partition( + self, edge_exported_program: ExportedProgram + ) -> PartitionResult: + partition_tags = {} + for node in edge_exported_program.graph.nodes: + if node.op == "call_function" and node.target in [ + exir_ops.edge.aten.add.Tensor + ]: + delegation_tag = "tag0" + node.meta["delegation_tag"] = delegation_tag + partition_tags[delegation_tag] = self.delegation_spec + tag_constant_data(edge_exported_program) + return PartitionResult( + tagged_exported_program=edge_exported_program, + partition_tags=partition_tags, + ) + + # Check the edge program inital buffers_to_mutate + mutate_op = "aten_add_tensor_1" + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + edge = edge.to_backend(PartitionerTagData()) + # After to_backend, add is delegated and is no longer in buffers_to_mutate. + self.assertNotIn( + mutate_op, + edge.exported_program().graph_signature.buffers_to_mutate, + ) + + mutate_op = "getitem_1" + # Ensure the mutated buffer is not delegated, and the new mutate node is getitem (from call_delegate) + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + # Check the copy_ node is inserted + edge = edge.to_executorch() + copy_node = [ + node + for node in edge.exported_program().graph.nodes + if node.op == "call_function" + and node.target == torch.ops.aten.copy_.default + ] + self.assertEqual(len(copy_node), 1) diff --git a/exir/backend/utils.py b/exir/backend/utils.py index f4c1c28f8bd..b299ba4be8a 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -508,6 +508,20 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: subgraph. Throw error when const/param/buffers is used across different partitions. That is the underlying data will be owned by multiple delegates. 
""" + mutated_buffer = set() + for node in edge_program.graph.nodes: + if node.op == "placeholder" and ( + is_param(edge_program, node) + or is_buffer(edge_program, node) + or is_lifted_tensor_constant(edge_program, node) + ): + for node_user in node.users: + if node_user.name in edge_program.graph_signature.buffers_to_mutate: + logging.info( + "The buffer node is a mutated buffer node, which is not constant." + ) + mutated_buffer.add(node) + for node in edge_program.graph.nodes: # go through const/param/buffer nodes, if all users of const/param/buffer nodes are partitioned then partition if node.op == "placeholder" and ( @@ -515,20 +529,21 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: or is_buffer(edge_program, node) or is_lifted_tensor_constant(edge_program, node) ): - user_tags = set() - for user in node.users: - user_tag = user.meta.get("delegation_tag", None) - if user_tag is not None: - user_tags.add(user_tag) - if len(user_tags) > 1: - logging.info( - f"The data node is used across multiple partitions, including {user_tags}. " - "If the data is too large and it's not preferred to copy, please tag the " - "constant node like node.['no_copy'] = True and they won't be copied." - ) - # tag the data node with the same tag as the last user - if len(user_tags) > 0: - node.meta["delegation_tag"] = user_tags.pop() + if node not in mutated_buffer: + user_tags = set() + for user in node.users: + user_tag = user.meta.get("delegation_tag", None) + if user_tag is not None: + user_tags.add(user_tag) + if len(user_tags) > 1: + logging.info( + f"The data node is used across multiple partitions, including {user_tags}. " + "If the data is too large and it's not preferred to copy, please tag the " + "constant node like node.['no_copy'] = True and they won't be copied." + ) + # tag the data node with the same tag as the last user + if len(user_tags) > 0: + node.meta["delegation_tag"] = user_tags.pop() # TODO - style: use templated types diff --git a/exir/capture/_config.py b/exir/capture/_config.py index d743e4b0329..a2d3b53bcb6 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -75,3 +75,7 @@ class ExecutorchBackendConfig: # be a power of 2. If not provided, uses the value in the schema file. 
delegate_alignment: Optional[int] = None sym_shape_eval_pass: PassType = HintBasedSymShapeEvalPass() + + # If set to true, view_copy operations will be converted to lightweight + # view operations in the ET runtime + remove_view_copy: bool = True diff --git a/exir/delegate.py b/exir/delegate.py index 959bd4bb17c..076e08daf37 100644 --- a/exir/delegate.py +++ b/exir/delegate.py @@ -102,7 +102,7 @@ def fake_requires_grad(var): var.requires_grad = True return var - return pytree.tree_map(fake_requires_grad, res) + return pytree.tree_map_only(torch.Tensor, fake_requires_grad, res) return res diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index fc3e446af9c..6b545e0a7d3 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union -import executorch.extension.pytree as ex_pytree import torch import torch.fx from executorch.exir.emit._emitter import ( @@ -18,89 +17,12 @@ _TopLevelEmitter, ) from executorch.exir.error import ExportError, ExportErrorType -from executorch.exir.schema import ( - Bool, - Chain, - ContainerMetadata, - Double, - EValue, - ExecutionPlan, - Int, - Program, - String, - SubsegmentOffsets, -) -from executorch.exir.tensor import layout_enum, scalar_type_enum +from executorch.exir.schema import Program, SubsegmentOffsets from executorch.exir.version import EXECUTORCH_SCHEMA_VERSION from torch.export.exported_program import ExportedProgram, OutputKind from torch.utils import _pytree as pytree -def _emit_prim_getters(prim_getters: Dict[str, Any]) -> List[ExecutionPlan]: - """ - Given a mapping of function names to return values, emit simple execution - plans that just return these constant values. - - Precondition: All the values are primitives (bool, float, int, str, enum) - or structures (list, dict) of them. - """ - plans = [] - # flatten any structures - for method, vals in prim_getters.items(): - # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`. - flattened_output, spec = ex_pytree.tree_flatten(vals) - spec = spec.to_str() - chain = Chain( - inputs=[], - outputs=[], - instructions=[], - stacktrace=None, - ) - - # switch on type of prim - values = [] - for val in flattened_output: - if isinstance(val, float): - values.append(EValue(Double(val))) - - elif isinstance(val, bool): - values.append(EValue(Bool(val))) - - elif isinstance(val, int): - values.append(EValue(Int(val))) - - elif isinstance(val, str): - values.append(EValue(String(val))) - - elif isinstance(val, torch.dtype): - values.append(EValue(Int(scalar_type_enum(val)))) - - elif isinstance(val, torch.layout): - values.append(EValue(Int(layout_enum(val)))) - - else: - raise ExportError( - ExportErrorType.NOT_SUPPORTED, - f"Error emitting {method} which returns a value of type {type(val)}. 
which is not a supported primitive", - ) - - # add to plans - plans.append( - ExecutionPlan( - name=method, - values=values, - inputs=[], - outputs=list(range(0, len(values))), - chains=[chain], - operators=[], - delegates=[], - non_const_buffer_sizes=[0, 0], - container_meta_type=ContainerMetadata("", spec), - ) - ) - return plans - - @dataclass class EmitterOutput: """ @@ -220,7 +142,7 @@ def emit_program( # emit any primitive getters if prim_getters is not None: - plans.extend(_emit_prim_getters(prim_getters)) + plans.extend(emitter._emit_prim_getters(prim_getters)) return EmitterOutput( debug_handle_map=debug_handle_map, diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index af5614bf208..e581789cb31 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -87,6 +87,7 @@ TensorSpec, ) from executorch.exir.types import LeafValueSpec, ValueSpec +from torch._subclasses.fake_tensor import FakeTensor from torch.export.exported_program import ExportedProgram from torch.utils import _pytree as pytree @@ -844,6 +845,32 @@ def _emit_control_flow( ) ) + def _emit_view(self, args: Tuple[_Argument, ...]) -> _EmitterValue: + assert len(args) == 2 + + self_arg = self._emit_argument(args[0], torch.TensorType) # pyre-ignore[6] + size_arg = self._emit_argument(args[1], torch.ListType.ofInts()) + out_arg = self._emit_argument( + self._emit_spec(self.node.meta["spec"]), torch.TensorType # pyre-ignore[6] + ) + + op_idx, op = self._get_operator( + name="executorch_prim::et_view", + overload="default", + ) + kernel = Instruction( + KernelCall( + op_idx, + args=[ + self_arg.id, + size_arg.id, + out_arg.id, + ], + ) + ) + self.chain.instructions.append(kernel) + return out_arg + def _add_debug_handle(self, emitter_id: int, target: _Target) -> None: """Updates the debug handle information for the current node. @@ -907,6 +934,35 @@ def _emit_argument( return arg return self._emit_evalue(self._constant_to_evalue(arg, arg_type)) + def _get_sym_ret( + self, + val: Tuple[Union[torch.SymInt, torch.BoolType, torch.FloatType, FakeTensor]], + ) -> Optional[_AbstractValue]: + """ + Returns the emit ret for sym value. + """ + ret = None + if isinstance(val, torch.SymInt): + ret = self._emit_evalue(EValue(Int(0))) + elif isinstance(val, torch.BoolType): + ret = self._emit_evalue(EValue(Bool(False))) + elif isinstance(val, torch.FloatType): + ret = self._emit_evalue(EValue(Double(0))) + return ret + + def _get_sym_and_fake_tensor_ret( + self, + val: Tuple[Union[torch.SymInt, torch.BoolType, torch.FloatType, FakeTensor]], + spec: TensorSpec, + ) -> Union[List[_AbstractValue], _AbstractValue, Tuple[_AbstractValue, ...]]: + # Try to get the ret if it's a sym value. 
+ ret = self._get_sym_ret(val) + # If the ret is None, it means that the val is not a sym value, but a regular tensor + if ret is None: + ret = self._emit_spec(spec) + assert ret is not None, "Can't have a None ret" + return ret + def _emit_delegate( self, lowered_module: "LoweredBackendModule", # noqa @@ -918,7 +974,40 @@ def _emit_delegate( processed_bytes = lowered_module.processed_bytes delegate_index = self.emitter_state.delegate_cache.get(processed_bytes) - delegate_ret = self._emit_spec(self.node.meta["spec"]) + delegate_ret = None + + if isinstance(self.node.meta["spec"], list): + delegate_ret = [] + for index, _ in enumerate(self.node.meta["val"]): + ret = self._get_sym_and_fake_tensor_ret( + self.node.meta["val"][index], self.node.meta["spec"][index] + ) + delegate_ret.append(ret) + elif isinstance(self.node.meta["spec"], tuple): + if isinstance(self.node.meta["val"], FakeTensor): + # There is a case when node.meta["spec"] is (TensorSpec, ) while node.meta["val"] is FakeTensor + ret = self._get_sym_and_fake_tensor_ret( + self.node.meta["val"], self.node.meta["spec"][0] + ) + delegate_ret = (ret,) + else: + delegate_ret = [] + for index, _ in enumerate(self.node.meta["val"]): + ret = self._get_sym_and_fake_tensor_ret( + self.node.meta["val"][index], self.node.meta["spec"][index] + ) + delegate_ret.append(ret) + delegate_ret = tuple(delegate_ret) + elif isinstance(self.node.meta["spec"], TensorSpec): + ret = self._get_sym_and_fake_tensor_ret( + self.node.meta["val"], self.node.meta["spec"] + ) + delegate_ret = ret + else: + raise NotImplementedError( + f"self.node.meta['spec'] {type(self.node.meta['spec'])} is not supported" + ) + assert delegate_ret is not None, "Can't have a None delegate_ret" if delegate_index is None: # Allocate an entry for the data. TODO(T150113674): Reuse any duplicate entries if # present. @@ -1036,13 +1125,8 @@ def _get_empty_tensor_evalue() -> EValue: torch.BoolType, torch.NumberType, ), f"Only symbolic ops that return a Int Bool Float are supported currently got {type(target._schema.returns[0].type)}." - if type(target._schema.returns[0].type) == torch.IntType: - ret = self._emit_evalue(EValue(Int(0))) - elif type(target._schema.returns[0].type) == torch.BoolType: - ret = self._emit_evalue(EValue(Bool(False))) - elif type(target._schema.returns[0].type) == torch.FloatType: - ret = self._emit_evalue(EValue(Double(0))) - else: # type(target._schema.returns[0].type) == torch.NumberType: + ret = self._get_sym_ret(target._schema.returns[0]) + if ret is None: # type(target._schema.returns[0].type) == torch.NumberType: # Cant definitively say what type this is, the runtime operator just overrides the EValue completely # though so we can just serialize whatever as a placeholder. ret = self._emit_evalue(EValue(Int(0))) @@ -1089,6 +1173,77 @@ def _emit_free(self, spec: TensorSpec) -> _AbstractValue: # The value is not used but the caller expects an AbstractValue returned. return _AbstractValue(None, None) # pyre-ignore + def _emit_prim_getters(self, prim_getters: Dict[str, Any]) -> List[ExecutionPlan]: + """ + Given a mapping of function names to return values, emit simple execution + plans that just return these constant values. + + Precondition: All the values are primitives (bool, float, int, str, enum) + or structures (list, dict) of them. + """ + plans = [] + # flatten any structures + for method, vals in prim_getters.items(): + # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`. 
+ flattened_output, spec = ex_pytree.tree_flatten(vals) + spec = spec.to_str() + chain = Chain( + inputs=[], + outputs=[], + instructions=[], + stacktrace=None, + ) + + # switch on type of prim + values = [] + for val in flattened_output: + if isinstance(val, float): + values.append(EValue(Double(val))) + + elif isinstance(val, bool): + values.append(EValue(Bool(val))) + + elif isinstance(val, int): + values.append(EValue(Int(val))) + + elif isinstance(val, str): + values.append(EValue(String(val))) + + elif isinstance(val, torch.dtype): + values.append(EValue(Int(scalar_type_enum(val)))) + + elif isinstance(val, torch.layout): + values.append(EValue(Int(layout_enum(val)))) + + elif isinstance(val, torch.Tensor): + values.append( + self._tensor_spec_to_evalue( + TensorSpec.from_tensor(val, const=True) + ) + ) + + else: + raise ExportError( + ExportErrorType.NOT_SUPPORTED, + f"Error emitting {method} which returns a value of type {type(val)}. which is not a supported primitive", + ) + + # add to plans + plans.append( + ExecutionPlan( + name=method, + values=values, + inputs=[], + outputs=list(range(0, len(values))), + chains=[chain], + operators=[], + delegates=[], + non_const_buffer_sizes=[0], + container_meta_type=ContainerMetadata("", spec), + ) + ) + return plans + def fetch_attr(self, target: _Target) -> _AbstractValue: """Fetch weights and other module parameters. If the attribute is a tensor, emit it.""" attr = super().fetch_attr(target) @@ -1198,6 +1353,9 @@ def call_function( assert len(args) == 1 return self._emit_spec(self.node.meta["spec"]) + elif target == memory.view: + return self._emit_view(args) + elif target == memory.free: assert len(args) == 1 # pyre-ignore diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 3eebe52faef..23481a07aaf 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -265,16 +265,24 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: edge = to_edge(export(f, inputs)) removed_ops = ["aten::relu_", "aten::view"] - expected_ops = ["aten::sin", "aten::relu", "aten::max", "aten::view_copy"] + expected_ops = [ + "aten::sin", + "aten::relu", + "aten::max", + "executorch_prim::et_view", # aten::view_copy if ExecutorchBackendConfig.remove_view_copy = False + ] for opname in removed_ops: self.assertEqual( self.count_node(edge.exported_program().graph_module, opname), 0 ) for opname in expected_ops: - self.assertTrue( - self.count_node(edge.exported_program().graph_module, opname) >= 1 - ) + if ( + opname != "executorch_prim::et_view" + ): # et_view appears as call_function with target = memory.view in graph + self.assertTrue( + self.count_node(edge.exported_program().graph_module, opname) >= 1 + ) program = edge.to_executorch().executorch_program for opname in removed_ops: @@ -1057,6 +1065,9 @@ def forward(self, k: torch.Tensor) -> torch.Tensor: self.check_tensor_buffer_loc(1, execution_plan.values, 0, 1, 48) def test_emit_prims(self) -> None: + tensor_output = torch.rand(1, 4) + tensor_list_output = [torch.rand(1, 4), torch.rand(1, 4)] + class Simple(torch.nn.Module): def __init__(self) -> None: super().__init__() @@ -1070,6 +1081,12 @@ def get_ints(self) -> Tuple[int]: def get_str(self) -> str: return "foo" + def get_tensor(self) -> torch.Tensor: + return tensor_output + + def get_tensor_list(self) -> List[torch.Tensor]: + return tensor_list_output + def forward(self, x: torch.Tensor) -> torch.Tensor: return torch.nn.functional.sigmoid(self.linear(x)) @@ -1082,9 +1099,12 @@ def forward(self, x: torch.Tensor) 
-> torch.Tensor: getters = {} getters["get_ints"] = model.get_ints() getters["get_str"] = model.get_str() - print(getters["get_str"]) + getters["get_tensor"] = model.get_tensor() + getters["get_tensor_list"] = model.get_tensor_list() + merged_program = emit_program(exir_input, False, getters).program - self.assertEqual(len(merged_program.execution_plan), 3) + + self.assertEqual(len(merged_program.execution_plan), 5) self.assertEqual( merged_program.execution_plan[0].name, @@ -1098,6 +1118,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: merged_program.execution_plan[2].name, "get_str", ) + self.assertEqual( + merged_program.execution_plan[3].name, + "get_tensor", + ) + self.assertEqual( + merged_program.execution_plan[4].name, + "get_tensor_list", + ) + # no instructions in a getter self.assertEqual( len(merged_program.execution_plan[1].chains[0].instructions), @@ -1133,6 +1162,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: merged_program.execution_plan[2].values[0].val.string_val, "foo", ) + self.assertEqual(len(merged_program.execution_plan[3].outputs), 1) + self.assertEqual(len(merged_program.execution_plan[4].outputs), 2) + + merged_program = to_edge( + export(model, inputs), constant_methods=getters + ).to_executorch() + executorch_module = _load_for_executorch_from_buffer(merged_program.buffer) + torch.allclose(executorch_module.run_method("get_tensor", [])[0], tensor_output) + model_output = executorch_module.run_method("get_tensor_list", []) + for i in range(len(tensor_list_output)): + torch.allclose(model_output[i], tensor_list_output[i]) def test_emit_debug_handle_map(self) -> None: mul_model = Mul() diff --git a/exir/experimental/TARGETS b/exir/experimental/TARGETS deleted file mode 100644 index 418ee54df6e..00000000000 --- a/exir/experimental/TARGETS +++ /dev/null @@ -1,25 +0,0 @@ -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") - -oncall("executorch") - -python_library( - name = "export_pt2", - srcs = ["export_pt2.py"], - deps = [ - "//caffe2:torch", - "//executorch/exir:error", - "//executorch/exir:lib", - "//executorch/exir:tracer", - ], -) - -python_library( - name = "lib", - srcs = [ - "__init__.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:tensor", - ], -) diff --git a/exir/experimental/__init__.py b/exir/experimental/__init__.py deleted file mode 100644 index c3e1dc8317f..00000000000 --- a/exir/experimental/__init__.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -import copy -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils._pytree as pytree -from executorch.exir.tensor import TensorSpec -from torch._export.serde.schema import TensorMeta -from torch._export.serde.serialize import ( - _SERIALIZE_TO_TORCH_DTYPE, - serialize_tensor_meta, -) -from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode -from torch.fx.experimental.symbolic_shapes import ShapeEnv - - -def add_assertions(graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule: - modified_graph_module = copy.deepcopy(graph_module) - - graph = modified_graph_module.graph - for node in graph.nodes: - if node.op != "call_function" and node.op != "placeholder": - continue - - # Ignore constants - if node.meta.get("val", None) is None: - continue - - # Ignore non-torch ops - if node.op == "call_function" and ( - not isinstance(node.target, torch._ops.OpOverload) - ): - continue - - shape = node.meta["val"].shape - dtype = node.meta["val"].dtype - node_name = node.name - with graph.inserting_after(node): - - def check_spec( - x: TensorSpec, shape: List[int], dtype: torch.dtype, node_name: str - ) -> None: - assert list(x.shape) == list( - shape - ), f"Expected {node_name} shape to be {shape}, got {x.shape}" - assert ( - x.dtype == dtype - ), f"Expected {node_name} dtype to be {dtype}, got {x.dtype}" - - graph.call_function(check_spec, (node, shape, dtype, node_name)) - - modified_graph_module.recompile() - return modified_graph_module - - -def convert_fake_tensor_to_tensor_meta( - ep: torch.fx.GraphModule, -) -> Tuple[torch.fx.GraphModule, Optional[ShapeEnv]]: - """ - Replace the faketensor metadata with the tensor metadata dataclass since we - cannot serialize faketensors - """ - shape_env = None - for node in ep.graph.nodes: - - def get_shape_env( - val: Union[List[FakeTensor], FakeTensor] - ) -> Optional[ShapeEnv]: - val_flat, _ = pytree.tree_flatten(val) - curr_shape_env = None - for v in val_flat: - if not isinstance(v, FakeTensor): - continue - if curr_shape_env is None: - curr_shape_env = v.fake_mode.shape_env - else: - assert ( - curr_shape_env is v.fake_mode.shape_env - ), "Multiple shape envs detected." - return curr_shape_env - - if (val := node.meta.get("val", None)) is not None: - if shape_env is None: - shape_env = get_shape_env(val) - elif (new_shape_env := get_shape_env(val)) is not None: - assert shape_env is new_shape_env, "Multiple shape envs detected." - - node.meta["tensor_meta"] = pytree.tree_map_only( - torch.Tensor, serialize_tensor_meta, val - ) - del node.meta["val"] - - return ep, shape_env - - -def convert_tensor_meta_to_fake_tensor( - ep: torch.fx.GraphModule, shape_env: Optional[ShapeEnv] = None -) -> torch.fx.GraphModule: - """ - Replace (inplace) the tensor metadata with faketensor - """ - fake_tensor_mode: FakeTensorMode = FakeTensorMode( - allow_non_fake_inputs=True, shape_env=shape_env - ) - for node in ep.graph.nodes: - if (val := node.meta.get("tensor_meta", None)) is not None: - - def _extract_faketensor(tensor_meta: TensorMeta) -> FakeTensor: - return FakeTensor( - fake_tensor_mode, - torch.empty( - # TODO Support dynamic shape. 
- tuple(s.as_int for s in tensor_meta.sizes), - dtype=_SERIALIZE_TO_TORCH_DTYPE[tensor_meta.dtype], - device="meta", - requires_grad=tensor_meta.requires_grad, - ), - torch.device("cpu"), - ) - - node.meta["val"] = pytree.tree_map_only( - TensorMeta, _extract_faketensor, val - ) - return ep diff --git a/exir/experimental/export_pt2.py b/exir/experimental/export_pt2.py deleted file mode 100644 index df040147f4b..00000000000 --- a/exir/experimental/export_pt2.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -# This class is for prototyping PyTorch 2.0 Export -from dataclasses import dataclass -from enum import Enum -from typing import Any, Callable, List, Optional, Tuple - -import torch - -from executorch import exir -from executorch.exir import CaptureConfig -from executorch.exir.error import ExportError, ExportErrorType, InternalError - -from executorch.exir.tracer import Value - -from torch._dynamo.guards import Guard as DynamoGuard - - -class GuardType(Enum): - TENSOR_MATCH = 1 - - -class GuardResolution(Enum): - IGNORE = 1 - CHECK_AT_RUNTIME = 2 - ERROR_AT_EXPORT = 3 - - -@dataclass -class Guard: - """ - This is our own custom Guard class to store - information needed for EXIR. This will only - store things we actually need. - """ - - guard_type: GuardType - obj: Any # pyre-ignore - check_code: str - - -@dataclass -class Trace: - """ - Immutable object that abstracts the result of exir.trace - which is essentially a torch.fx.GraphModule plus all the assumptions - that are made about this tracing that are represented as Guard. - """ - - graph_module: torch.fx.GraphModule - guards: List[Guard] - inputs: Tuple[Value] - - -class ExportSession: - def __init__(self, trace: Trace) -> None: - """ - Mutable object where user can interactively resolve guards to access the final graph_module. - """ - self.trace = trace - self.guard_rules: List[Callable[[Guard], Optional[GuardResolution]]] = [] - - # TODO (make more specific rule) - def default_rule(guard: Guard) -> Optional[GuardResolution]: - if guard.guard_type != GuardType.TENSOR_MATCH: - return GuardResolution.IGNORE - return None - - self.guard_rules.append(default_rule) - - def summary(self) -> str: - """ - Prints the current status of guard resolutions in a module - hierarchical way. - """ - # TODO implement this - return "" - - def export(self) -> Optional[torch.fx.GraphModule]: - """ - Exports a final GraphModule that is ready to be executed. - This will require that all guards imposed on GraphModule are - resolved. 
- """ - - def _guard_remaining_filter(guard: Guard) -> bool: - guard_resolutions: List[Optional[GuardResolution]] = [ - guard_rule(guard) for guard_rule in self.guard_rules - ] - # if there was no guard resolutions, we should keep the guard - if len(guard_resolutions) == 0: - return True - - # later rules take priority - for idx in range(len(guard_resolutions) - 1, -1, -1): - if guard_resolutions[idx] is None: - continue - assert guard_resolutions is not None - if guard_resolutions[idx] in [ - GuardResolution.CHECK_AT_RUNTIME, - GuardResolution.IGNORE, - ]: - return False - if guard_resolutions[idx] == GuardResolution.ERROR_AT_EXPORT: - return True - # nothing has been resolved - return True - - remaining_guards = list(filter(_guard_remaining_filter, self.trace.guards)) - if len(remaining_guards) > 0: - raise ExportError( - ExportErrorType.VIOLATION_OF_SPEC, - "There are outstanding guards to be resolved to export this graph", - ) - return self.trace.graph_module - - def add_guard_rule( - self, guard_rule: Callable[[Guard], Optional[GuardResolution]] - ) -> None: - """ - Adds user provided guard rule. This rule will be applied when you call export() method. - """ - self.guard_rules.append(guard_rule) - - -def trace(root: Callable[..., Value], concrete_args: Tuple[Value, ...]) -> Trace: - """ - Runs torchdynamo with no-python mode and dispatch trace - to create a Trace object which is graph module plus guards that - need to be resolved. - """ - # TODO (yidi) cannot enable functionalization under exir.capture() pt2 mode - graph_module = exir.capture( - root, - concrete_args, - CaptureConfig(enable_functionalization=False), - ).graph_module - - # TODO convert torchdynamo guards to our own guards - def _convert_dynamo_guard_to_exir_guard( - dynamo_guard: DynamoGuard, - ) -> Optional[Guard]: - if dynamo_guard.guard_types is not None and len(dynamo_guard.guard_types) > 0: - # TODO (make sure this list is always element of 1) - guard_type = dynamo_guard.guard_types[0] - # TODO (add more guard types) - if guard_type == "TENSOR_MATCH": - # pyre-fixme[29]: `Optional[object]` is not a function. 
- return Guard(GuardType.TENSOR_MATCH, dynamo_guard.obj_weakref(), "") - - raise InternalError(f"Unregistered guard type: {dynamo_guard.guard_types}") - - guards: List[Guard] = [] - for g in graph_module.guards: - try: - guard = _convert_dynamo_guard_to_exir_guard(g) - assert isinstance(guard, Guard) - guards.append(guard) - except InternalError as e: - print(str(e)) - - return Trace(graph_module, guards, concrete_args) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index b025cef233d..5a8dbd5a9b2 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -454,17 +454,17 @@ def _get_new_signature( # noqa: C901 new_state_dict = {} new_constants = {} - input_tensor_node_to_sig = { - input_spec.arg.name: input_spec - for input_spec in old_signature.input_specs - if isinstance(input_spec.arg, TensorArgument) - } + placeholder_nodes = [ + node.name for node in original_program.graph.nodes if node.op == "placeholder" + ] + assert len(placeholder_nodes) == len(old_signature.input_specs) + input_node_to_sig = dict(zip(placeholder_nodes, old_signature.input_specs)) for node in gm.graph.nodes: is_tagged = tag is None or node.meta.get("delegation_tag", None) == tag if node.op == "placeholder": - if node.name not in input_tensor_node_to_sig: + if node.name not in input_node_to_sig: assert tag is not None input_specs.append( InputSpec( @@ -475,7 +475,7 @@ def _get_new_signature( # noqa: C901 ) continue - orig_input_spec = input_tensor_node_to_sig[node.name] + orig_input_spec = input_node_to_sig[node.name] if not isinstance(orig_input_spec.arg, TensorArgument): input_specs.append(orig_input_spec) @@ -528,7 +528,7 @@ def _get_new_signature( # noqa: C901 output_specs.append( OutputSpec( kind=OutputKind.USER_OUTPUT, - arg=ConstantArgument(output_node), + arg=ConstantArgument(name="", value=output_node), target=None, ) ) @@ -555,7 +555,7 @@ def _get_new_signature( # noqa: C901 output_specs.append( OutputSpec( kind=OutputKind.USER_OUTPUT, - arg=ConstantArgument(output_node), + arg=ConstantArgument(name="", value=output_node), target=None, ) ) diff --git a/exir/memory_planning.py b/exir/memory_planning.py index b8c47b440c5..675f196fcd8 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -397,6 +397,7 @@ def collect_specs_from_nodes( # noqa: C901 or node.target in [ memory.alloc, + memory.view, operator.getitem, torch.ops.higher_order.cond, exir_while, @@ -534,7 +535,13 @@ def get_node_tensor_specs( has no tensor specs. 
""" # get tensor specs - specs = node.meta.get("spec") + if node.target == memory.view: + base = node.args[0] + assert isinstance(base, torch.fx.Node) + specs = base.meta.get("spec") + else: + specs = node.meta.get("spec") + if isinstance(specs, TensorSpec): specs = [specs] if not isinstance(specs, (list, tuple)): diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS index f7f56ece2b4..7ec14fb7d88 100644 --- a/exir/passes/TARGETS +++ b/exir/passes/TARGETS @@ -92,6 +92,8 @@ python_library( ], deps = [ "//caffe2:torch", + "//executorch/exir/dialects:lib", + "//executorch/exir/dialects/edge:lib", ], ) diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py index 2611d6a1541..594de3f79a4 100644 --- a/exir/passes/__init__.py +++ b/exir/passes/__init__.py @@ -43,7 +43,7 @@ from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.passes.normalize_transpose_pass import NormalizeTransposePass from executorch.exir.passes.quant_fusion_pass import QuantFusionPass -from executorch.exir.passes.remove_noop_pass import RemoveNoopPass +from executorch.exir.passes.remove_noop_pass import RemoveNoopPass, RemoveToCopyPass from executorch.exir.passes.replace_aten_with_edge_pass import OpReplacePass from executorch.exir.passes.replace_broken_ops_with_function_ops_pass import ( ReplaceBrokenOpsWithFunctionalOpsPass, @@ -248,6 +248,7 @@ def callWithLoggerEnabled(self, graph_module: torch.fx.GraphModule) -> None: # we won't see it in the input graph to the to_out_variant pass, unless # it's retraced after running to_out_variant with the first trace. memory.alloc, + memory.view, executorch_call_delegate, torch.ops.aten.copy_.default, } @@ -481,6 +482,7 @@ def dead_code_elimination_pass(graph_module: torch.fx.GraphModule) -> PassResult ScalarToTensorPass(), SymToTensorPass(), RemoveNoopPass(), + RemoveToCopyPass(), ] ).passes diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 267d934dae6..c6ec40269f6 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import copy -from typing import Callable, List, Tuple +from typing import Callable, List, Optional, Tuple import torch from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops @@ -15,6 +15,7 @@ ) from torch import fx from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib +from torch.library import impl, impl_abstract __all__ = [ @@ -34,6 +35,271 @@ "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", ) +quantized_decomposed_lib.define( + "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)", +) + +quantized_decomposed_lib.define( + "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) 
out) -> Tensor(a!)", +) + + +def embedding_weight_checks(weight, weight_scales, weight_zero_points): + assert weight.dtype in [ + torch.int8, + torch.uint8, + ], f"Expecting weights to be of dtype in [torch.int8, torch.uint8], but got {weight.dtype}" + assert ( + weight.dim() == 2 + ), f"Expecting weight tensor to have dim()==2, but found {weight.dim()}" + + assert weight_scales.dtype in [ + torch.float16, + torch.float32, + ], f"Expecting weight_scales to be of dtype in [torch.float16, torch.float32], but got {weight_scales.dtype}" + assert ( + weight_scales.dim() == 1 or weight_scales.dim() == 2 + ), f"Expecting weight_scales tensor to have rank 1 or 2, but found {weight_scales.dim()}" + assert weight_scales.size(0) == weight.size( + 0 + ), f"Expecting weight and scale tensor to have same number of rows, but found {weight.size()} and {weight_scales.size()}" + + assert ( + weight_zero_points is None or weight_zero_points.dtype == weight_scales.dtype + ), "Expecting weight_zero_points to be None or have same dtype as weight_scales" + assert ( + weight_zero_points is None or weight_zero_points.dim() == 1 + ), f"Expecting weight_zero_points tensor to be None or have dim()==1, but found {weight_zero_points.dim()}" + assert weight_zero_points is None or weight_zero_points.size(0) == weight.size( + 0 + ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}" + + +@impl(quantized_decomposed_lib, "embedding_byte", "CompositeExplicitAutograd") +def embedding_byte( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, +) -> torch.Tensor: + embedding_weight_checks(weight, weight_scales, weight_zero_points) + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + return torch.ops.aten.embedding.default(weight, indices) + + +@impl_abstract("quantized_decomposed::embedding_byte.out") +def embedding_byte_out_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + out: torch.Tensor, +) -> torch.Tensor: + return embedding_byte( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + ) + + +@impl(quantized_decomposed_lib, "embedding_byte.dtype", "CompositeExplicitAutograd") +def embedding_byte_dtype( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + dtype: Optional[torch.dtype], +) -> torch.Tensor: + embedding_weight_checks(weight, weight_scales, weight_zero_points) + group_size = weight.size(1) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + dtype, + ) + return torch.ops.aten.embedding.default(weight, indices) + + +@impl_abstract("quantized_decomposed::embedding_byte.dtype_out") +def embedding_byte_dtype_out_meta( + 
weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + dtype: Optional[torch.dtype], + out: torch.Tensor, +) -> torch.Tensor: + return embedding_byte_dtype( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + dtype, + ) + + +quantized_decomposed_lib.define( + "embedding_4bit(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor", +) + +quantized_decomposed_lib.define( + "embedding_4bit.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", +) + +quantized_decomposed_lib.define( + "embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)", +) + +quantized_decomposed_lib.define( + "embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", +) + + +@impl(quantized_decomposed_lib, "embedding_4bit", "CompositeExplicitAutograd") +def embedding_4bit( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, +) -> torch.Tensor: + embedding_weight_checks(weight, weight_scales, weight_zero_points) + group_size = (2 * weight.size(1)) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight_even = weight.div(16, rounding_mode="trunc") + weight_odd = weight.remainder(16) + weight_unpacked = torch.stack((weight_even, weight_odd), dim=-1) + weight = weight_unpacked.view(weight.shape[0], -1) + weight = weight.view(torch.int8).add(-8) + + weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + weight_scales.dtype, + ) + return torch.ops.aten.embedding.default(weight, indices) + + +@impl_abstract("quantized_decomposed::embedding_4bit.out") +def embedding_4bit_out_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + out: torch.Tensor, +) -> torch.Tensor: + return embedding_4bit( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + ) + + +@impl(quantized_decomposed_lib, "embedding_4bit.dtype", "CompositeExplicitAutograd") +def embedding_4bit_dtype( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + dtype: Optional[torch.dtype], +) -> torch.Tensor: + embedding_weight_checks(weight, weight_scales, weight_zero_points) + group_size = (2 * weight.size(1)) // ( + weight_scales.size(1) if weight_scales.dim() == 2 else 1 + ) + weight_even = weight.div(16, rounding_mode="trunc") + weight_odd = weight.remainder(16) + weight_unpacked = torch.stack((weight_even, weight_odd), dim=-1) + weight = weight_unpacked.view(weight.shape[0], -1) + weight = weight.view(torch.int8).add(-8) + + weight = 
torch.ops.quantized_decomposed.dequantize_per_channel_group.default( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + weight.dtype, + group_size, + dtype, + ) + return torch.ops.aten.embedding.default(weight, indices) + + +@impl_abstract("quantized_decomposed::embedding_4bit.dtype_out") +def embedding_4bit_dtype_out_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: Optional[torch.Tensor], + weight_quant_min: int, + weight_quant_max: int, + indices: torch.Tensor, + dtype: Optional[torch.dtype], + out: torch.Tensor, +) -> torch.Tensor: + return embedding_4bit_dtype( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + dtype, + ) + + quantized_decomposed_lib.define( "mixed_mm(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points) -> Tensor", ) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 14ff651c936..0fabf223fb8 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -4,58 +4,145 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from collections import OrderedDict +from typing import cast, Mapping, Optional + import torch -from torch._export.utils import get_buffer, get_param, is_buffer, is_param +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload +from torch._export.utils import ( + get_buffer, + get_lifted_tensor_constant, + get_param, + is_buffer, + is_lifted_tensor_constant, + is_param, +) from torch._guards import detect_fake_mode from torch.export import ExportedProgram from torch.export.exported_program import InputKind, InputSpec, TensorArgument +from torch.utils import _pytree as pytree + + +# Avoid propagating constants for `exir.ops.edge.aten.full.default`. +# Propagating aten.full can significantly increase compiled model size. 
+_DEFAULT_SKIP_TARGETS = {exir_ops.edge.aten.full.default} +_PRIMITIVE_TYPES = ( + float, + int, + bool, + str, + torch.Tensor, + torch.device, + torch.dtype, + torch.layout, +) -def is_const(arg, exported_program, const_data_list) -> bool: + +def is_const( + arg, + exported_program: ExportedProgram, + const_node_to_tensor: Mapping[torch.fx.Node, torch.Tensor], +) -> bool: if isinstance(arg, (tuple, list)): - return all(is_const(x, exported_program, const_data_list) for x in arg) + return all(is_const(x, exported_program, const_node_to_tensor) for x in arg) elif isinstance(arg, dict): - return all(is_const(x, exported_program, const_data_list) for x in arg.values()) - elif not isinstance(arg, torch.fx.Node) or arg.op != "placeholder": + return all( + is_const(x, exported_program, const_node_to_tensor) for x in arg.values() + ) + elif isinstance(arg, _PRIMITIVE_TYPES): + return True + elif not isinstance(arg, torch.fx.Node): return False - elif ( - is_param(exported_program, arg) - or is_buffer(exported_program, arg) - or arg.name in const_data_list - ): + elif arg in const_node_to_tensor: return True return False -def get_data(exported_program, arg): +def get_data( + arg, + exported_program: ExportedProgram, + const_node_to_tensor: Mapping[torch.fx.Node, torch.Tensor], +): if isinstance(arg, (tuple, list)): - return [get_data(exported_program, x) for x in arg] - elif is_param(exported_program, arg): - return get_param(exported_program, arg) - elif is_buffer(exported_program, arg): - return get_buffer(exported_program, arg) + return type(arg)( + get_data(x, exported_program, const_node_to_tensor) for x in arg + ) + elif isinstance(arg, _PRIMITIVE_TYPES): + return arg + elif arg in const_node_to_tensor: + return const_node_to_tensor[arg] return None -def constant_prop_pass(exported_program: ExportedProgram) -> ExportedProgram: +def get_constant_placeholder_dict( + exported_program: ExportedProgram, +) -> OrderedDict[torch.fx.Node, torch.Tensor]: """ - This pass is for constant propagation for Exported Program with lifted parameters, - as the parameters will not be shown up as `get_attr` but as `placeholder` to the graph. + Returns a dictionary of placeholder node -> constant tensor. """ - if ( - len([node for node in exported_program.graph.nodes if node.op == "placeholder"]) - == 0 - ): - return exported_program + const_node_to_tensor: OrderedDict[torch.fx.Node, torch.Tensor] = OrderedDict() + for node in exported_program.graph.nodes: + if node.op != "placeholder": + continue + + if is_param(exported_program, node): + const_node_to_tensor[node] = cast( + torch.Tensor, get_param(exported_program, node) + ) + elif is_buffer(exported_program, node): + const_node_to_tensor[node] = cast( + torch.Tensor, get_buffer(exported_program, node) + ) + elif is_lifted_tensor_constant(exported_program, node): + const_node_to_tensor[node] = cast( + torch.Tensor, get_lifted_tensor_constant(exported_program, node) + ) + return const_node_to_tensor - has_cond = [ - node - for node in exported_program.graph.nodes - if node.target == torch.ops.higher_order.cond - ] - if len(has_cond) > 0: - raise RuntimeError("constant_prop_pass for control flow is not supported yet.") +def get_propagated_const_tensor_dict( + exported_program: ExportedProgram, + custom_skip_targets: Optional[set[EdgeOpOverload]], +) -> OrderedDict[torch.fx.Node, torch.Tensor]: + """ + Propagates constants and returns a dictionary of node->constant tensors. + """ + # Initialize dict with all constant placeholders. 
+ const_node_to_tensor = get_constant_placeholder_dict(exported_program) + + all_skip_targets: set[EdgeOpOverload] = set() + # Default set of targets to skip. + all_skip_targets.update(_DEFAULT_SKIP_TARGETS) + if custom_skip_targets is not None: + all_skip_targets.update(custom_skip_targets) + + for node in exported_program.graph.nodes: + if node.op != "call_function" or node.target in all_skip_targets: + continue + + if not is_const( + node.args, + exported_program, + const_node_to_tensor, + ): + continue + + args_data, kwargs_data = pytree.tree_map( + lambda x: get_data(x, exported_program, const_node_to_tensor), + (node.args, node.kwargs), + ) + + # Execute the `node.target` and create a new propagated constant tensor. + prop_constant_tensor = node.target(*args_data, **kwargs_data) + const_node_to_tensor[node] = prop_constant_tensor + + return const_node_to_tensor + + +def get_first_user_input(exported_program: ExportedProgram) -> torch.fx.Node: + """Returns the first user input node in the graph.""" first_user_input = None for node in exported_program.graph.nodes: if ( @@ -64,11 +151,42 @@ def constant_prop_pass(exported_program: ExportedProgram) -> ExportedProgram: ): first_user_input = node break + return first_user_input + + +def replace_with_constant_node( + node: torch.fx.Node, + prop_constant_tensor: torch.Tensor, + first_user_input: torch.fx.Node, + fake_mode, + exported_program: ExportedProgram, +) -> tuple[torch.fx.Node, str]: + # Add `prop_constant_tensor` to program.state_dict. + prop_constant_tensor_fqn = f"_prop_tensor_constant{len(exported_program.constants)}" + exported_program.constants[prop_constant_tensor_fqn] = prop_constant_tensor + + # Insert a new placeholder node for the propagated constant tensor. + with exported_program.graph.inserting_before(first_user_input): + const_placeholder_node = exported_program.graph.placeholder( + prop_constant_tensor_fqn + ) + + # Update the meta data of the new placeholder (buffer) node. + for k, v in node.meta.items(): + const_placeholder_node.meta[k] = v + const_placeholder_node.meta["val"] = fake_mode.from_tensor( + prop_constant_tensor, static_shapes=True + ) + const_placeholder_node.meta["val"].constant = prop_constant_tensor + + # Replace the original node with the new constant node. + node.replace_all_uses_with(const_placeholder_node) + exported_program.graph.erase_node(node) + + return const_placeholder_node, prop_constant_tensor_fqn - buffers = exported_program.graph_signature.buffers - prop_constant_data = [] - const_data_to_be_removed = set() +def get_fake_mode(exported_program: ExportedProgram): fake_mode = detect_fake_mode( tuple( node.meta["val"] @@ -77,57 +195,115 @@ def constant_prop_pass(exported_program: ExportedProgram) -> ExportedProgram: ) ) assert fake_mode is not None + return fake_mode + +def erase_constant_node( + exported_program: ExportedProgram, + node: torch.fx.Node, +) -> None: + # Remove corresponding tensor from param/constants dict. + signature = exported_program.graph_signature + if name := signature.inputs_to_parameters.pop(node.name, None): + exported_program.state_dict.pop(name, None) + elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): + exported_program.constants.pop(name, None) + elif name := signature.inputs_to_buffers.pop(node.name, None): + exported_program.constants.pop(name, None) + exported_program.state_dict.pop(name, None) + + # Remove from graph. 
+ exported_program.graph.erase_node(node) + + +def create_constant_nodes_and_return_specs( + const_node_to_tensor: Mapping[torch.fx.Node, torch.Tensor], + exported_program: ExportedProgram, +) -> dict[str, InputSpec]: + """ + Creates constant nodes for all entries in `const_node_to_tensor` and returns a node.name -> InputSpec dict. + """ + name_to_spec_dict: dict[str, InputSpec] = {} + + fake_mode = get_fake_mode(exported_program) + first_user_input = get_first_user_input(exported_program) + + # Iterate over nodes in reverse order. + for node, prop_constant_tensor in reversed(const_node_to_tensor.items()): + if all(x in const_node_to_tensor for x in node.users): + # All users of this constant node are also constant, so we don't need to create a new constant node. + erase_constant_node(exported_program, node) + continue + + if node.op == "placeholder": + continue + + const_placeholder_node, prop_constant_tensor_fqn = replace_with_constant_node( + node, prop_constant_tensor, first_user_input, fake_mode, exported_program + ) + + # Create input spec for lifted constant. + name_to_spec_dict[const_placeholder_node.name] = InputSpec( + kind=InputKind.CONSTANT_TENSOR, + arg=TensorArgument(name=const_placeholder_node.name), + target=prop_constant_tensor_fqn, + persistent=True, + ) + return name_to_spec_dict + + +def constant_prop_pass( + exported_program: ExportedProgram, + custom_skip_targets: Optional[set[EdgeOpOverload]] = None, +) -> ExportedProgram: + """ + This pass is for constant propagation for Exported Program with lifted parameters, + as the parameters will not be shown up as `get_attr` but as `placeholder` to the graph. + + Args: + exported_program: The ExportedProgram to perform constant propagation on. + custom_skip_targets: Optional set of EdgeOpOverload targets to skip during constant propagation. + + Returns: + The modified ExportedProgram with constant propagation applied. + """ + if ( + len([node for node in exported_program.graph.nodes if node.op == "placeholder"]) + == 0 + ): + return exported_program + + has_control_flow = [ + node + for node in exported_program.graph.nodes + if node.target == torch.ops.higher_order.cond + ] + if len(has_control_flow) > 0: + raise RuntimeError("constant_prop_pass for control flow is not supported yet.") + + const_node_to_tensor = get_propagated_const_tensor_dict( + exported_program, custom_skip_targets + ) + + # Get old input specs. + name_to_spec_dict = { + s.arg.name: s for s in exported_program.graph_signature.input_specs + } + # Add the new constants to input specs dict. + name_to_spec_dict.update( + create_constant_nodes_and_return_specs(const_node_to_tensor, exported_program) + ) + + # Generate new input spec. 
+ new_input_specs = [] for node in exported_program.graph.nodes: - if node.op == "call_function": - constant_data_name_list = [ - input_spec.target for input_spec in prop_constant_data - ] - if is_const(node.args, exported_program, constant_data_name_list): - args_data = [get_data(exported_program, arg) for arg in node.args] - kwargs_data = node.kwargs - const_data_to_be_removed.update(node.args) - prop_constant_tensor = node.target(*args_data, **kwargs_data) - prop_constant_tensor_fqn = f"_prop_tensor_constant{len(buffers)}" - - with exported_program.graph.inserting_before(first_user_input): - const_placeholder_node = exported_program.graph.placeholder( - prop_constant_tensor_fqn - ) - # Update the meta data of the new placeholder (buffer) node - for k, v in node.meta.items(): - const_placeholder_node.meta[k] = v - const_placeholder_node.meta["val"] = fake_mode.from_tensor( - prop_constant_tensor, static_shapes=True - ) - const_placeholder_node.meta["val"].constant = prop_constant_tensor - - node.replace_all_uses_with(const_placeholder_node) - exported_program.graph.erase_node(node) - prop_constant_node_input_spec = InputSpec( - kind=InputKind.BUFFER, - arg=TensorArgument(name=const_placeholder_node.name), - target=prop_constant_tensor_fqn, - persistent=True, - ) - prop_constant_data.append(prop_constant_node_input_spec) - buffers.append(prop_constant_tensor_fqn) - exported_program.state_dict[prop_constant_tensor_fqn] = ( - prop_constant_tensor - ) - exported_program.graph_signature.input_specs.append( - prop_constant_node_input_spec - ) - - # Remove the propogated buffer from the state dict - for node in exported_program.graph.nodes: - if ( - node.op == "placeholder" - and node in const_data_to_be_removed - and len(node.users) == 0 - ): - exported_program.state_dict.pop(node.name, None) - exported_program.graph.erase_node(node) + if node.op != "placeholder": + continue + new_input_specs.append(name_to_spec_dict[node.name]) + exported_program.graph_signature.input_specs = new_input_specs + # Cleanup the graph. + exported_program.graph.eliminate_dead_code() exported_program.graph_module.recompile() + return exported_program diff --git a/exir/passes/normalize_view_copy_base_pass.py b/exir/passes/normalize_view_copy_base_pass.py index 7ff1346da5a..2c98a8525d4 100644 --- a/exir/passes/normalize_view_copy_base_pass.py +++ b/exir/passes/normalize_view_copy_base_pass.py @@ -29,8 +29,6 @@ class NormalizeViewCopyBasePass(PassBase): When combined with dead-code elimination, this pass removes redundant view_copy nodes. - - TODO: replace RemoveRedundantViewCopyPass with NormalizeViewCopyBasePass + dead code elimination. """ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: diff --git a/exir/passes/remove_noop_pass.py b/exir/passes/remove_noop_pass.py index 5394bed5a28..cdc68c19b3c 100644 --- a/exir/passes/remove_noop_pass.py +++ b/exir/passes/remove_noop_pass.py @@ -90,3 +90,30 @@ def call(self, graph_module: GraphModule) -> PassResult: graph_module.graph.eliminate_dead_code() return PassResult(graph_module, True) + + +class RemoveToCopyPass(ExportPass): + """ + Removes _to_copy that pass through arguments. 
+ """ + + def call(self, graph_module: GraphModule) -> PassResult: + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + + if node.target not in (torch.ops.aten._to_copy.default,): + continue + + orig_tensor = node.args[0].meta["val"] + + if ( + orig_tensor.dtype == node.meta["val"].dtype + and orig_tensor.device == node.meta["val"].device + ): + node.replace_all_uses_with(node.args[0]) + + graph_module.graph.eliminate_dead_code() + graph_module.graph.lint() + + return PassResult(graph_module, True) diff --git a/exir/passes/replace_view_copy_with_view_pass.py b/exir/passes/replace_view_copy_with_view_pass.py index 33f98304174..a9304f3eec8 100644 --- a/exir/passes/replace_view_copy_with_view_pass.py +++ b/exir/passes/replace_view_copy_with_view_pass.py @@ -6,9 +6,9 @@ # pyre-strict +import copy import logging -import math -from typing import Any, Dict, List, Tuple +from typing import Any, List, Tuple import torch from executorch.exir import memory @@ -36,28 +36,113 @@ def _is_view_copy(node: torch.fx.Node) -> bool: _VIEW_OP = memory.view +class _Guard: + def __init__( + self, name: str, field_lambda, expected_val: Any # pyre-ignore[2] + ) -> None: + self.name: str = name + self.field_lambda = field_lambda # pyre-ignore[4] + self.expected_val = copy.deepcopy(expected_val) # pyre-ignore[4] + + def __call__(self, view_spec) -> None: # pyre-ignore[2] + assert view_spec._unguarded_access + observed_val = self.field_lambda(view_spec) + if observed_val != self.expected_val: + raise Exception( + f"Guard {self.name} failed. Expected to see value {self.expected_val}, but saw value {observed_val}." + ) + + class _ViewSpec(TensorSpec): def __init__(self, base: TensorSpec, shape: List[int]) -> None: """ - A ViewSpec is an immutable TensorSpec that mirrors its base for non-size - related information. - """ + A _ViewSpec is TensorSpec that shares non-size related fields with its base. + The size-related fields are: shape, stride, dim_order, and shape_dynamism. - if math.prod(base.shape) != math.prod(shape): - raise Exception( - f"Cannot create a ViewSpec because the provided shape {shape} is not consistent with the number of elements in the provided base ({math.prod(base.shape)})." - ) + If either the base or view spec updates a non-size related field, the change + is reflected in both specs. But size related fields are not linked and can + be set separately. - self._init_setters = [ - "_frozen", - "_base", - "_guards", + A _ViewSpec can only be created from a non-sparse, strided TensorSpec. + On creation, a _ViewSpec must be compatible with its base with respect to + shape_dynamism, dtype, and nbytes. + + A _ViewSpec contains _guards that are evaluated on every __getattribute__ call. + The purpose of the guards is to make sure the _ViewSpec is still compatible + with its base. + """ + + # Explicitly put all attributes into _self_fields or _base_fields + # Any attribute that is not in _self_fields or _base_fields will + # raise an Exception. If TensorSpec is extended with a new attribute, + # we should explicitly decide how _ViewSpec will handle it. + self._self_fields = [ + # We need to get the debug method from self + # so that the object id it prints is correct. 
+ "debug", # method + "__repr__", # method + # The following are related to size and should use self "shape", "stride", "dim_order", "shape_dynamism", + "nbytes", # method + "allocated_memory", # property + "is_dynamic_shape_tensor", # property + "is_static_shape_tensor", # property + "is_upper_bound_tensor", # property + "is_dynamic_unbound_tensor", # property + ] + self._base_fields = [ + "scalar_type", + "const", + "alignment", + "storage", + "requires_grad", + "layout", + "is_sparse", + "init_mem_planning_fields", # method + "realign", # method + "from_tensor", # class method + "lifetime", + "mem_id", + "mem_obj_id", + "mem_offset", + "dtype", # property ] - self._frozen = False + + # Make sure _self_fields and _base_fields are disjoint + assert len(set(self._self_fields) & set(self._base_fields)) == 0 + + self._guards: List[_Guard] = [] + self._unguarded_access = False + + # Make sure base is not sparse and add a guard + if base.is_sparse: + raise Exception( + "_ViewSpec can only be created from non-sparse TensorSpec, but base.is_sparse=True." + ) + self._guards.append( + _Guard( + "is_sparse", + lambda view_spec: view_spec.is_sparse, + False, + ) + ) + + # Make sure base layout is strided and add a guard + if base.layout != torch.strided: + raise Exception( + f"_ViewSpec can only be created from TensorSpec with layout={torch.strided}, but got layout={base.layout}." + ) + self._guards.append( + _Guard( + "layout", + lambda view_spec: view_spec.layout, + torch.strided, + ) + ) + self._base = base self.shape: List[int] = shape self.stride: Tuple[int] = contiguous_stride_from_shape(torch.Size(self.shape)) @@ -66,66 +151,108 @@ def __init__(self, base: TensorSpec, shape: List[int]) -> None: torch.Size(self.shape) ) - # This spec gives a view into its base. - # The base can be modified (e.g., mem_id) and this spec will - # update accordingly, but certain fields we do not expect to change - # We create guards for these - self._guards: Dict[str, Any] = { - "shape_dynamism": base.shape_dynamism, - "scalar_type": base.scalar_type, - "layout": base.layout, - "is_sparse": base.is_sparse, - } - self._frozen = True - - def _check_guards(self) -> None: - for name in self._guards: - if getattr(self._base, name) != self._guards[name]: - raise Exception( - f"The guarded attribute '{name}' has changed value. At creation of the ViewSpec, it was {self._guards[name]}, but it is now {getattr(self._base, name)}." - ) + # Check compatibility with base on creation + if self.shape_dynamism != base.shape_dynamism: + raise Exception( + f"_ViewSpec is incompatible with its base on creation. It has shape_dynamism={self.shape_dynamism}, but its base has shape_dynamism={base.shape_dynamism}." + ) + self._guards.append( + _Guard( + "shape_dynamism_init", + lambda view_spec: view_spec.shape_dynamism, + base.shape_dynamism, + ) + ) + self._guards.append( + _Guard( + "shape_dynamism_eq_base", + lambda view_spec: view_spec.shape_dynamism + == view_spec._base.shape_dynamism, + True, + ) + ) + + if self.dtype != base.dtype: + raise Exception( + f"_ViewSpec is incompatible with its base on creation. It has dtype={self.dtype}, but its base has dtype={base.dtype}." + ) + self._guards.append( + _Guard("dtype", lambda view_spec: view_spec.dtype, base.dtype) + ) + + # We do not guard nbytes because dynamic symints are replaced by upper bounds. + # We do guard on rank, though + if self.nbytes() != base.nbytes(): + raise Exception( + f"_ViewSpec is incompatible with its base on creation. 
It has nbytes={self.nbytes()}, but its base has nbytes={base.nbytes()}." + ) + self._guards.append( + _Guard("rank", lambda view_spec: len(view_spec.shape), len(shape)) + ) - def __getattribute__(self, name): # pyre-ignore + def _run_guards(self) -> None: + unguarded_access = self._unguarded_access + try: + self._unguarded_access = True + for g in self._guards: + g(self) + finally: + self._unguarded_access = unguarded_access + + def __getattribute__(self, name: str): # pyre-ignore + # Special field so we don't recurse infinitely if name in [ - "_init_setters", - "_frozen", "_base", + "_self_fields", + "_base_fields", "_guards", - "_check_guards", - # Adding debug is needed so that view_spec.debug() shows the right id in - # its string (if debug is excluded, it shows the id(view_spec._base) instead - # of id(view_spec)) - "debug", + "_unguarded_access", + "_run_guards", ]: return object.__getattribute__(self, name) - # Guard check after freeze - if self._frozen: - self._check_guards() + # Get some attributes from self + if name in self._self_fields: + val = object.__getattribute__(self, name) + elif name in self._base_fields: + val = object.__getattribute__(self._base, name) + else: + if len(name) > 0 and name[0] != "_": + logger.warning( + f"Getting non-private attribute {name} on self, but it is not in _self_fields or _base_fields. Is this intended?" + ) + val = object.__getattribute__(self, name) - # self._init_setters attributes come from self, others come from base - if name in self._init_setters: - return object.__getattribute__(self, name) - return getattr(self._base, name) + if not self._unguarded_access: + self._run_guards() + return val def __setattr__(self, name: str, val) -> None: # pyre-ignore - if name in ["_init_setters", "_frozen"]: + # Special field so we don't recurse infinitely + if name in [ + "_base", + "_self_fields", + "_base_fields", + "_guards", + "_unguarded_access", + "_run_guards", + ]: object.__setattr__(self, name, val) return - # Allow setting during initialization - if name in self._init_setters and not self._frozen: + if name in self._self_fields: object.__setattr__(self, name, val) return - if name in self._init_setters: - raise Exception( - f"ViewSpec is immutable. Cannot set the attribute '{name}' after creation." - ) + if name in self._base_fields: + object.__setattr__(self._base, name, val) + return - raise Exception( - f"ViewSpec is immutable. To update the non-size related attribute '{name}', update the base." - ) + if len(name) > 0 and name[0] != "_": + logger.warning( + f"Setting non-private attribute {name} on self, but it is not in _self_fields or _base_fields. Is this intended?" + ) + object.__setattr__(self, name, val) class ReplaceViewCopyWithViewPass(PassBase): @@ -151,8 +278,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: node.target = _VIEW_OP # Create spec for the node. - # _ViewSpec is an immutable TensorSpec gives a view into - # its base spec for non-size related information. + # _ViewSpec gives a view into its base spec for non-size + # related information. # the shape is not the same as node.args[1] because node.args[1] # can have an inferred sizes (-1). 
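Reviewer note on the `_ViewSpec` rework above: the previously frozen/immutable spec is replaced by a guarded delegation scheme in which size-related fields (`shape`, `stride`, `dim_order`, `shape_dynamism`, `nbytes`) live on the view spec itself, all other fields are read from and written to the base spec, and a list of `_Guard` checks is re-evaluated on every attribute access so that an incompatible change to the base (dtype, layout, sparsity, shape dynamism, rank) is caught immediately. The snippet below is a minimal standalone sketch of that access pattern only; `_Base` and `_View` are toy stand-ins invented for illustration, not ExecuTorch classes, and the real `_ViewSpec` additionally suppresses guard re-entry via its `_unguarded_access` flag.

    class _Base:
        def __init__(self) -> None:
            self.dtype = "float32"   # stands in for a non-size field shared with views
            self.shape = [2, 6]      # stands in for a size field owned per spec

    class _View:
        _OWN = {"_base", "_guards", "shape", "_OWN"}

        def __init__(self, base, shape) -> None:
            object.__setattr__(self, "_base", base)
            object.__setattr__(self, "shape", shape)
            # Guard: the base's dtype must not change after this view is created.
            object.__setattr__(self, "_guards", [("dtype", base.dtype)])

        def __getattribute__(self, name):
            own = object.__getattribute__(self, "_OWN")
            if name in own:
                return object.__getattribute__(self, name)
            base = object.__getattribute__(self, "_base")
            # Re-check every guard on each delegated access, analogous to _run_guards.
            for field, expected in object.__getattribute__(self, "_guards"):
                observed = getattr(base, field)
                if observed != expected:
                    raise RuntimeError(
                        f"guard on {field!r} failed: expected {expected!r}, saw {observed!r}"
                    )
            return getattr(base, name)

    base = _Base()
    view = _View(base, shape=[3, 4])
    print(view.shape, view.dtype)   # size comes from the view, dtype from the base
    base.dtype = "int8"             # mutate a guarded field on the base ...
    try:
        view.dtype                  # ... and the next delegated access raises
    except RuntimeError as err:
        print(err)

Checking the guards on access rather than freezing the spec is what lets later stages (e.g. memory planning) keep mutating base fields such as `mem_id`, `mem_obj_id`, and `mem_offset` while still surfacing any change that would invalidate the view.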
diff --git a/exir/program/TARGETS b/exir/program/TARGETS index 49da0648a06..5ae3cf1ac59 100644 --- a/exir/program/TARGETS +++ b/exir/program/TARGETS @@ -33,8 +33,10 @@ python_library( "//executorch/exir/emit:lib", "//executorch/exir/passes:insert_write_back_for_buffers_pass", "//executorch/exir/passes:lib", + "//executorch/exir/passes:normalize_view_copy_base_pass", "//executorch/exir/passes:remove_graph_asserts_pass", "//executorch/exir/passes:remove_mixed_type_operators", + "//executorch/exir/passes:replace_view_copy_with_view_pass", "//executorch/exir/passes:spec_prop_pass", "//executorch/exir/verification:verifier", ], diff --git a/exir/program/_program.py b/exir/program/_program.py index 990804fdcda..c62214f051c 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -24,6 +24,7 @@ from executorch.exir.passes import ( base_post_op_replace_passes, base_pre_op_replace_passes, + dead_code_elimination_pass, EdgeToBackendOpsPass, MemoryFormatOpsPass, OpReplacePass, @@ -31,8 +32,14 @@ from executorch.exir.passes.insert_write_back_for_buffers_pass import ( insert_write_back_for_buffers_pass, ) +from executorch.exir.passes.normalize_view_copy_base_pass import ( + NormalizeViewCopyBasePass, +) from executorch.exir.passes.remove_graph_asserts_pass import RemoveGraphAssertsPass from executorch.exir.passes.remove_mixed_type_operators import RemoveMixedTypeOperators +from executorch.exir.passes.replace_view_copy_with_view_pass import ( + ReplaceViewCopyWithViewPass, +) from executorch.exir.passes.spec_prop_pass import SpecPropPass from executorch.exir.print_program import pretty_print, print_program from executorch.exir.schema import Program @@ -48,7 +55,6 @@ unsafe_remove_auto_functionalized_pass, ) from torch.export.exported_program import ( - _get_updated_range_constraints, ConstantArgument, ExportGraphSignature, InputKind, @@ -64,6 +70,39 @@ Val = Any +def _get_updated_range_constraints(gm): + def get_shape_env(gm): + vals = [ + node.meta["val"] + for node in gm.graph.nodes + if node.meta.get("val", None) is not None + ] + from torch._guards import detect_fake_mode # type: ignore[21] + + fake_mode = detect_fake_mode(vals) + if fake_mode is not None: + return fake_mode.shape_env + for v in vals: + if isinstance(v, torch.SymInt): + return v.node.shape_env + + shape_env = get_shape_env(gm) + if shape_env is None: + return {} + range_constraints = { + k: v + for k, v in shape_env.var_to_range.items() + if k not in shape_env.replacements + } + # Only when we have an unbacked symint, and it's used as constructor inputs, + # runtime_var_to_range will make a difference compated to var_to_range. + # e.g. 
[2, oo) -> [0, oo) + for k, v in shape_env.var_to_range.items(): + if k not in shape_env.replacements: + range_constraints[k] = v + return range_constraints + + def _get_updated_graph_signature( old_signature: ExportGraphSignature, new_gm: torch.fx.GraphModule, @@ -583,8 +622,25 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": return new_ep +def pre_memory_planning_passes(config: ExecutorchBackendConfig) -> List[PassType]: + if config.remove_view_copy: + # pyre-ignore + return [ + NormalizeViewCopyBasePass(), + dead_code_elimination_pass, + ReplaceViewCopyWithViewPass(), + config.sym_shape_eval_pass, + config.to_out_var_pass, + ] + else: + # pyre-ignore + return [ + config.sym_shape_eval_pass, + config.to_out_var_pass, + ] + + def edge_to_executorch_passes(config: ExecutorchBackendConfig) -> List[PassType]: - # pyre-ignore passes: List[PassType] = [ *config.passes, SpecPropPass(), @@ -593,9 +649,8 @@ def edge_to_executorch_passes(config: ExecutorchBackendConfig) -> List[PassType] # there exists an unbacked symint operation. EdgeToBackendOpsPass(), RemoveGraphAssertsPass(), - config.sym_shape_eval_pass, - config.to_out_var_pass, - ] + ] + pre_memory_planning_passes(config) + return passes @@ -688,7 +743,7 @@ class EdgeProgramManager: def __init__( self, - edge_programs: Dict[str, ExportedProgram], + edge_programs: Union[ExportedProgram, Dict[str, ExportedProgram]], constant_methods: Optional[Dict[str, Any]] = None, compile_config: Optional[EdgeCompileConfig] = None, ): @@ -698,6 +753,8 @@ def __init__( Constructs an EdgeProgramManager from an existing set of exported programs in edge dialect. """ config = compile_config or EdgeCompileConfig() + if not isinstance(edge_programs, dict): + edge_programs = {"forward": edge_programs} for name, program in edge_programs.items(): try: EXIREdgeDialectVerifier( @@ -708,7 +765,7 @@ def __init__( logging.info(f"Input program {name} is not in aten dialect.") raise e - self._edge_programs = edge_programs + self._edge_programs: Dict[str, ExportedProgram] = edge_programs self._config_methods = constant_methods @property diff --git a/exir/serde/TARGETS b/exir/serde/TARGETS index ff3cf7999f6..10c970867d7 100644 --- a/exir/serde/TARGETS +++ b/exir/serde/TARGETS @@ -3,6 +3,8 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") oncall("executorch") python_library( + # @autodeps-skip for some reason autodeps thinks this target + # needs to depend on exir:lib which it doesn't. 
name = "serialize", srcs = [ "export_serialize.py", @@ -13,7 +15,6 @@ python_library( ":schema", "//caffe2:torch", "//executorch/exir:delegate", - "//executorch/exir:lib", "//executorch/exir:lowered_backend_module", "//executorch/exir:memory", "//executorch/exir/backend:compile_spec_schema", diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index 799a1dbe78f..87691dfbee2 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -242,7 +242,9 @@ def deserialize_torch_artifact(serialized: bytes): return {} buffer = io.BytesIO(serialized) buffer.seek(0) - return torch.load(buffer) + # TODO: If possible, it's better to set weights_only to True + # https://pytorch.org/docs/stable/generated/torch.load.html + return torch.load(buffer, weights_only=False) def _sympy_int_to_int(val: sympy.Expr): @@ -1190,13 +1192,17 @@ def deserialize_tensor_meta( ), ) - def deserialize_graph_output(self, output) -> torch.fx.Node: + def deserialize_graph_output(self, output) -> Optional[Union[torch.fx.Node, int]]: if output.type == "as_tensor": return self.serialized_name_to_node[output.as_tensor.name] elif output.type == "as_sym_int": return self.serialized_name_to_node[output.as_sym_int.as_name] elif output.type == "as_sym_bool": return self.serialized_name_to_node[output.as_sym_bool.as_name] + elif output.type == "as_int": + return output.as_int + elif output.type == "as_none": + return None else: raise SerializeError(f"Unable to deserialize output node {output}") @@ -1249,7 +1255,8 @@ def deserialize_graph(self, serialized_graph: Graph) -> torch.fx.Graph: output_node.meta["val"] = output_node.args[0].meta["val"] else: output_node.meta["val"] = tuple( - arg.meta["val"] for arg in output_node.args[0] + arg.meta["val"] if isinstance(arg, torch.fx.Node) else arg + for arg in output_node.args[0] ) return self.graph diff --git a/exir/serde/schema.py b/exir/serde/schema.py index 494aa29de29..7536a323ee0 100644 --- a/exir/serde/schema.py +++ b/exir/serde/schema.py @@ -25,3 +25,7 @@ class LoweredBackendModule: compile_specs: List[CompileSpec] original_module: export_schema.ExportedProgram original_state_dict: str + + +# NOTE: Please update this value if any modifications are made to the schema +SCHEMA_VERSION = (1, 0) diff --git a/exir/serde/serialize.py b/exir/serde/serialize.py index 5eb28b830ce..5826a52b01f 100644 --- a/exir/serde/serialize.py +++ b/exir/serde/serialize.py @@ -9,9 +9,12 @@ import base64 import copy import dataclasses +import io import json import logging import operator +import os +import zipfile from typing import Any, Callable, Dict, List, Optional, Union import executorch.exir as exir @@ -33,9 +36,9 @@ from executorch.exir.serde.schema import ( CompileSpec, LoweredBackendModule as SerdeLoweredBackendModule, + SCHEMA_VERSION, ) from torch._export.serde.schema import SchemaVersion -from torch._export.serde.serialize import SerializeError from torch._export.serde.union import _Union from torch._export.verifier import load_verifier from torch.fx.experimental import symbolic_shapes @@ -479,23 +482,22 @@ def deserialize_metadata(self, metadata: Dict[str, str]) -> Dict[str, Any]: return res - def deserialize_graph_output(self, output: schema.Argument) -> torch.fx.Node: - if isinstance(output.value, schema.TensorArgument): - if output.value.name in self.state_dict: # TODO(T157676982) - val = self.state_dict[output.value.name] - setattr(self.module, output.value.name, val) - node = self.graph.create_node( - "get_attr", - output.value.name, - 
name=output.value.name, - ) - node.meta = {"val": ""} - return node - return self.serialized_name_to_node[output.value.name] - elif isinstance(output.value, (schema.SymIntArgument, schema.SymBoolArgument)): - return self.serialized_name_to_node[output.value.as_name] - else: - raise SerializeError(f"Unable to deserialize output node {output}") + def deserialize_graph_output( + self, output: schema.Argument + ) -> Optional[Union[torch.fx.Node, int]]: + if ( + output.type == "as_tensor" and output.value.name in self.state_dict + ): # TODO(T157676982) + val = self.state_dict[output.value.name] + setattr(self.module, output.value.name, val) + node = self.graph.create_node( + "get_attr", + output.value.name, + name=output.value.name, + ) + node.meta = {"val": ""} + return node + return super().deserialize_graph_output(output) # pyre-ignore def deserialize_alloc_inputs(self, serialized_inputs: List[schema.NamedArgument]): @@ -628,7 +630,7 @@ class ExportedProgramDeserializer(export_serialize.ExportedProgramDeserializer): def deserialize( self, serialized_artifact: export_serialize.SerializedArtifact, - ) -> exir.ExportedProgram: + ) -> ep.ExportedProgram: assert isinstance(serialized_artifact.exported_program, schema.ExportedProgram) symbol_name_to_range = { @@ -677,7 +679,7 @@ def deserialize( root=state_dict, graph=dummy_g, graph_signature=ep.ExportGraphSignature(input_specs=[], output_specs=[]), - state_dict={}, # TODO(T157676982) + state_dict=state_dict, # TODO(T157676982) range_constraints=range_constraints, module_call_graph=module_call_graph, verifier=load_verifier( @@ -738,7 +740,7 @@ def serialize( def deserialize( artifact: export_serialize.SerializedArtifact, expected_opset_version: Optional[Dict[str, int]] = None, -) -> exir.ExportedProgram: +) -> ep.ExportedProgram: assert isinstance(artifact.exported_program, bytes) exported_program_str = artifact.exported_program.decode("utf-8") exported_program_dict = json.loads(exported_program_str) @@ -750,3 +752,98 @@ def deserialize( serialized_exported_program, artifact.state_dict, artifact.constants ) ) + + +def save( + ep_save: ep.ExportedProgram, + f: Union[str, os.PathLike, io.BytesIO], + *, + extra_files: Optional[Dict[str, Any]] = None, + opset_version: Optional[Dict[str, int]] = None, +) -> None: + if not isinstance(ep_save, ep.ExportedProgram): + raise TypeError(f"save() expects an ExportedProgram but got {type(ep)}") + + artifact: export_serialize.SerializedArtifact = serialize(ep_save, opset_version) + + if isinstance(f, (str, os.PathLike)): + f = os.fspath(f) + + with zipfile.ZipFile(f, "w") as zipf: + # Save every field in the SerializedArtifact to a file. 
+ assert isinstance(artifact.exported_program, bytes) + zipf.writestr("serialized_exported_program.json", artifact.exported_program) + zipf.writestr("serialized_state_dict.pt", artifact.state_dict) + zipf.writestr("serialized_constants.pt", artifact.constants) + + zipf.writestr("version", ".".join(map(str, SCHEMA_VERSION))) + + # Add extra files if provided + if extra_files: + for extra_file_name, content in extra_files.items(): + encoded_content = content.encode("utf-8") + zipf.writestr(f"extra_files/{extra_file_name}", encoded_content) + + +def load( + f: Union[str, os.PathLike, io.BytesIO], + *, + extra_files: Optional[Dict[str, Any]] = None, + expected_opset_version: Optional[Dict[str, int]] = None, +) -> ep.ExportedProgram: + if isinstance(f, (str, os.PathLike)): + f = os.fspath(f) + + extra_files = extra_files or {} + + with zipfile.ZipFile(f, "r") as zipf: + # Check the version + version = zipf.read("version").decode().split(".") + + assert len(version) == len(SCHEMA_VERSION) + if version[0] != str(SCHEMA_VERSION[0]): + raise RuntimeError( + f"Serialized version {version} does not match our current " + f"schema version {SCHEMA_VERSION}." + ) + + # Load serialized_ep and serialized_state_dict from the zip file + + serialized_exported_program: Optional[bytes] = None + serialized_state_dict: Optional[bytes] = None + serialized_constants: Optional[bytes] = None + + for file_info in zipf.infolist(): + file_content = zipf.read(file_info.filename) + + if file_info.filename == "serialized_exported_program.json": + serialized_exported_program = file_content + elif file_info.filename == "serialized_state_dict.json": + print("This version of file is deprecated") + serialized_state_dict = file_content + elif file_info.filename == "serialized_constants.json": + print("This version of file is deprecated") + serialized_constants = file_content + elif file_info.filename == "serialized_state_dict.pt": + serialized_state_dict = file_content + elif file_info.filename == "serialized_constants.pt": + serialized_constants = file_content + elif file_info.filename.startswith("extra_files"): + filename = file_info.filename.split("/", 1)[1] + extra_files[filename] = file_content.decode("utf-8") + + assert serialized_exported_program is not None + assert serialized_state_dict is not None + assert serialized_constants is not None + artifact: export_serialize.SerializedArtifact = ( + export_serialize.SerializedArtifact( + serialized_exported_program, + serialized_state_dict, + serialized_constants, + ) + ) + + # Deserialize ExportedProgram + ep = deserialize(artifact, expected_opset_version) + + return ep diff --git a/exir/tests/TARGETS b/exir/tests/TARGETS index 0c3232916d6..94a82d8a2bc 100644 --- a/exir/tests/TARGETS +++ b/exir/tests/TARGETS @@ -411,3 +411,17 @@ python_unittest( "//executorch/exir:print_program", ], ) + +python_unittest( + name = "test_remove_view_copy", + srcs = [ + "test_remove_view_copy.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/exir:memory", + "//executorch/exir/capture:config", + "//executorch/exir/passes:lib", + ], +) diff --git a/exir/tests/models.py b/exir/tests/models.py index c9eb0761935..74c86dab807 100644 --- a/exir/tests/models.py +++ b/exir/tests/models.py @@ -7,7 +7,7 @@ # pyre-strict import itertools -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import executorch.exir as exir @@ -34,6 +34,11 @@ def forward( def get_random_inputs(self) -> Tuple[torch.Tensor, torch.Tensor]: return 
(torch.rand(4), torch.rand(5)) + def get_dynamic_shape(self) -> Any: # pyre-ignore[3] + dim = torch.export.Dim("dim", max=10) + dim2 = torch.export.Dim("dim2", max=10) + return ({0: dim}, {0: dim2}) + class ModelWithUnusedArg(nn.Module): def __init__(self) -> None: diff --git a/exir/tests/test_dynamic_shape_propagation.py b/exir/tests/test_dynamic_shape_propagation.py index 1cd699a4266..abc07d60437 100644 --- a/exir/tests/test_dynamic_shape_propagation.py +++ b/exir/tests/test_dynamic_shape_propagation.py @@ -7,8 +7,10 @@ from unittest import TestCase from executorch import exir +from executorch.exir import to_edge from executorch.exir.passes import DebugPass, HintBasedSymShapeEvalPass, SpecPropPass from executorch.exir.tests.models import Repeat +from torch.export import export class TestDynamicShapeProp(TestCase): @@ -17,15 +19,14 @@ def test_repeat(self): inputs = eager_model.get_random_inputs() inputs = inputs[0], inputs[1] - prog = exir.capture( - eager_model, - inputs, - exir.CaptureConfig(enable_dynamic_shape=True), - ).to_edge(exir.EdgeCompileConfig(_check_ir_validity=False)) + prog = to_edge( + export(eager_model, inputs, dynamic_shapes=eager_model.get_dynamic_shape()), + compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), + ) - new_prog = prog.transform(SpecPropPass(), HintBasedSymShapeEvalPass()) + new_prog = prog.transform([SpecPropPass(), HintBasedSymShapeEvalPass()]) - gm = new_prog.exported_program.graph_module + gm = new_prog.exported_program().graph_module DebugPass(show_spec=True)(gm) *_, return_node = gm.graph.nodes diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 848269c6573..582941eb2cd 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -65,6 +65,7 @@ XNNPACKQuantizer, ) from torch.export import export +from torch.export.graph_signature import InputKind, InputSpec, TensorArgument from torch.fx import GraphModule, subgraph_rewriter from torch.fx.experimental.proxy_tensor import make_fx from torch.library import impl, Library @@ -1145,7 +1146,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Check (_lifted_tensor_constant + to_copy) node is replaced by prop tensor FileCheck().check_not("_lifted_tensor_constant").check( - "_prop_tensor_constant1" + "_prop_tensor_constant0" ).check_not("executorch_exir_dialects_edge__ops_aten__to_copy_default").run( new_ep.graph_module.code ) @@ -1174,6 +1175,105 @@ def forward(self, x): new_ep = constant_prop_pass(aten) self.assertEqual(count_additions(new_ep.graph_module), 1) + def test_constant_prop_pass_graph_signature(self) -> None: + def count_additions(gm: torch.fx.GraphModule) -> int: + return sum( + (node.target == torch.ops.aten.add.Tensor) for node in gm.graph.nodes + ) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = torch.nn.Parameter(torch.ones(1, 2, 3)) + + def forward(self, x): + b = self.a + self.a + c = torch.cat([self.a, b]) + return (c + c) + x + + aten = export( + M(), + (torch.zeros(2, 2, 3),), + ) + # Input signature will have two entries: + # (1) parameter `a` and (2) user input `x`. + self.assertEqual(len(aten.graph_signature.input_specs), 2) + new_ep = constant_prop_pass(aten) + # Check that there are exactly two propagated tensors - (1) propagated + # constant and (2) user input. 
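For reference, the migrated dynamic-shape test above exercises the new torch.export-based flow end to end. A minimal sketch of that flow follows, assuming a generic `model`/`inputs` pair in place of the `Repeat` fixture; the `Dim` bounds mirror the `get_dynamic_shape()` helper added above, and `MyEagerModule` is a hypothetical placeholder.

import torch
from executorch import exir
from executorch.exir import to_edge
from executorch.exir.passes import HintBasedSymShapeEvalPass, SpecPropPass
from torch.export import Dim, export

# Placeholders: any eager module and matching example inputs.
model, inputs = MyEagerModule(), (torch.rand(4), torch.rand(5))

# Mark dim 0 of each input as dynamic, mirroring get_dynamic_shape() above.
dyn = ({0: Dim("dim", max=10)}, {0: Dim("dim2", max=10)})

prog = to_edge(
    export(model, inputs, dynamic_shapes=dyn),
    compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
)
# transform() now takes a list of passes and returns a new edge program.
prog = prog.transform([SpecPropPass(), HintBasedSymShapeEvalPass()])
gm = prog.exported_program().graph_module  # exported_program is now a method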
+ self.assertEqual( + new_ep.graph_signature.input_specs, + [ + InputSpec( + kind=InputKind.CONSTANT_TENSOR, + arg=TensorArgument(name="_prop_tensor_constant0"), + target="_prop_tensor_constant0", + persistent=True, + ), + # User input graph signature. + aten.graph_signature.input_specs[-1], + ], + ) + + def test_constant_prop_pass_for_parameter_slice(self) -> None: + def count_slice(gm: torch.fx.GraphModule) -> int: + return sum( + (node.target == torch.ops.aten.slice_copy.Tensor) + for node in gm.graph.nodes + ) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = torch.nn.Parameter(torch.ones(3, 2, 2)) + + def forward(self, x): + # Create slice of shape (1, 2, 2) + slice_tensor = torch.slice_copy(self.a, dim=0, start=0, end=1) + return torch.cat([x, slice_tensor]) + + aten = export( + M(), + (torch.zeros(2, 2, 2),), + ) + self.assertIn("a", aten.state_dict) + self.assertEqual(count_slice(aten.graph_module), 1) + + new_ep = constant_prop_pass(aten) + # Check there is a propagated tensor. + FileCheck().check("_prop_tensor_constant0").run(aten.graph_module.code) + self.assertIn("_prop_tensor_constant0", new_ep.constants) + self.assertNotIn("a", new_ep.state_dict) + # No more slice copy. + self.assertEqual(count_slice(new_ep.graph_module), 0) + + def test_constant_prop_pass_no_propagate(self) -> None: + def count_placeholder(gm: torch.fx.GraphModule) -> int: + return sum((node.op == "placeholder") for node in gm.graph.nodes) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = torch.nn.Parameter(torch.ones(3, 2, 4)) + + def forward(self, x, y): + # y is unused. + return x + self.a + + aten = export( + M(), + (torch.zeros(3, 2, 4), torch.zeros(3, 2, 4)), + ) + self.assertIn("a", aten.state_dict) + self.assertEqual(count_placeholder(aten.graph_module), 3) + + new_ep = constant_prop_pass(aten) + # Check there is no propagated tensor. 
+ FileCheck().check("p_a").check("x").check("y").run(aten.graph_module.code) + self.assertNotIn("_prop_tensor_constant0", new_ep.constants) + self.assertIn("a", new_ep.state_dict) + self.assertEqual(count_placeholder(new_ep.graph_module), 3) + def test_constant_prop_pass_for_control_flow(self) -> None: class Module(torch.nn.Module): def __init__(self): @@ -1498,61 +1598,29 @@ def __init__(self): self.parameter = torch.nn.Parameter(torch.ones(1)) def forward(self, x): - o1 = torch.ops.aten.view_copy.default( - self.parameter, [1] - ) # replaceable parameter - o2 = torch.ops.aten.view_copy.default(x, [1]) # replaceable user input - o3 = torch.ops.aten.view_copy.default( - torch.ops.aten.relu.default(x), [1] - ) # replaceable dynamic unbound - o4 = torch.ops.aten.view_copy.default( - torch.ops.aten.gelu.default(x), [1] - ) # replaceable dynamic bound - o5 = torch.ops.aten.view_copy.default( - torch.ops.aten.tanh.default(x), [1] - ) # replaceable static - return o1, o2, o3, o4, o5 + o1 = torch.ops.aten.view_copy.default(x, [1]) + o2 = torch.ops.aten.view_copy.default(self.parameter, [1]) + return o1, o2 ep = torch.export.export( TestViewCopies(), args=(torch.ones(1),), ) - self.assertEqual(len(ep.graph.nodes), 11) for node in ep.graph.nodes: if node.op == "placeholder": node.meta["spec"] = TensorSpec.from_tensor(torch.empty(1)) node.meta["spec"].shape_dynamism = TensorShapeDynamism.STATIC - elif node.target == torch.ops.aten.relu.default: - node.meta["spec"] = TensorSpec.from_tensor(torch.empty(1)) - node.meta["spec"].shape_dynamism = TensorShapeDynamism.DYNAMIC_UNBOUND - elif node.target == torch.ops.aten.gelu.default: - node.meta["spec"] = TensorSpec.from_tensor(torch.empty(1)) - node.meta["spec"].shape_dynamism = TensorShapeDynamism.DYNAMIC_BOUND - elif node.target == torch.ops.aten.tanh.default: - node.meta["spec"] = TensorSpec.from_tensor(torch.empty(1)) - node.meta["spec"].shape_dynamism = TensorShapeDynamism.STATIC - elif node.target == torch.ops.aten.view_copy.default: - node.meta["spec"] = TensorSpec.from_tensor(torch.empty(1)) - node.meta["spec"].shape_dynamism = ( - node.args[0].meta["spec"].shape_dynamism - ) - else: - pass # Run tests gm = ep.graph_module # Check before transformation - n_view_copy_before = 0 - n_memory_view_before = 0 - for node in gm.graph.nodes: - if is_view_copy(node): - n_view_copy_before += 1 - if is_memory_view(node): - n_memory_view_before += 1 - - self.assertEqual(n_view_copy_before, 5) - self.assertEqual(n_memory_view_before, 0) + FileCheck().check_count( + "torch.ops.aten.view_copy.default", 2, exactly=True + ).run(gm.code) + FileCheck().check_count("executorch_exir_memory_view", 0, exactly=True).run( + gm.code + ) # Do transformation p = ReplaceViewCopyWithViewPass() @@ -1560,14 +1628,10 @@ def forward(self, x): assert gm_res is not None gm = gm_res.graph_module - # Check after transformation - n_view_copy_after = 0 - n_memory_view_after = 0 - for node in gm.graph.nodes: - if is_view_copy(node): - n_view_copy_after += 1 - if is_memory_view(node): - n_memory_view_after += 1 - - self.assertEqual(n_view_copy_after, 0) - self.assertEqual(n_memory_view_after, 5) + # Check before transformation + FileCheck().check_count( + "torch.ops.aten.view_copy.default", 0, exactly=True + ).run(gm.code) + FileCheck().check_count("executorch_exir_memory_view", 2, exactly=True).run( + gm.code + ) diff --git a/exir/tests/test_quant_fusion_pass.py b/exir/tests/test_quant_fusion_pass.py index 00269da92d7..791cb3e16ef 100644 --- a/exir/tests/test_quant_fusion_pass.py +++ 
b/exir/tests/test_quant_fusion_pass.py @@ -10,7 +10,7 @@ import torch from executorch import exir -from executorch.exir import CaptureConfig, EdgeCompileConfig +from executorch.exir import EdgeCompileConfig, to_edge from executorch.exir.passes.quant_fusion_pass import QuantFusionPass from executorch.exir.tests.common import register_additional_test_aten_ops from torch.ao.quantization import ( # @manual @@ -26,6 +26,7 @@ _convert_to_reference_decomposed_fx, prepare_fx, ) +from torch.export import export from torch.nn import functional as F from torch.testing import FileCheck @@ -56,9 +57,11 @@ def forward(self, x, y): ) m = _convert_to_reference_decomposed_fx(m) config = EdgeCompileConfig(_check_ir_validity=False) - m = exir.capture(m, example_inputs, CaptureConfig()).to_edge(config=config) + m = to_edge(export(m, example_inputs), compile_config=config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. - m = m.transform(QuantFusionPass(_fix_node_meta_val=True)) + m = m.transform( + [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False + ) # check that we are using functional variant of q/dq/add FileCheck().check( "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default" @@ -67,12 +70,12 @@ def forward(self, x, y): ).check( "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default" ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) m = m.to_executorch() # check that we are using out variant of q/dq/add FileCheck().check("torch.ops.quantized_decomposed.add.out").run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) def test_reshape(self) -> None: @@ -95,9 +98,11 @@ def forward(self, x, y): m(*example_inputs) m = _convert_to_reference_decomposed_fx(m) config = EdgeCompileConfig(_check_ir_validity=False) - m = exir.capture(m, example_inputs, CaptureConfig()).to_edge(config=config) + m = to_edge(export(m, example_inputs), compile_config=config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. - m = m.transform(QuantFusionPass(_fix_node_meta_val=True)) + m = m.transform( + [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False + ) # check that we are using functional variant of q/dq/add/reshape # make sure we only have two quant and one dequant since the q/dq around reshape # should be fused @@ -114,14 +119,14 @@ def forward(self, x, y): 1, exactly=True, ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) - m = m.to_executorch() + m = m.to_executorch(exir.ExecutorchBackendConfig(remove_view_copy=False)) # check that we are using out variant of q/dq/add FileCheck().check("torch.ops.quantized_decomposed.add.out").check( "torch.ops.aten.view_copy.out" - ).run(m.exported_program.graph_module.code) + ).run(m.exported_program().graph_module.code) def test_slice(self) -> None: """We don't proactively quantize slice today, but we'll fuse the dq-slice-q @@ -150,9 +155,11 @@ def forward(self, x, y): ) m = _convert_to_reference_decomposed_fx(m) config = EdgeCompileConfig(_check_ir_validity=False) - m = exir.capture(m, example_inputs, CaptureConfig()).to_edge(config=config) + m = to_edge(export(m, example_inputs), compile_config=config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. 
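For reference, the API sequence these updated tests exercise is sketched below, under the assumption of a module `m` and `example_inputs` that have already been through the reference decomposed quantization flow (prepare_fx / convert); both names are placeholders.

import torch
from executorch import exir
from executorch.exir import EdgeCompileConfig, to_edge
from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
from torch.export import export

# `m` and `example_inputs` are placeholders for a quantized module and its inputs.
edge = to_edge(
    export(m, example_inputs),
    compile_config=EdgeCompileConfig(_check_ir_validity=False),
)
# transform() now takes a list of passes; the updated tests also skip IR
# validity checks at this stage.
edge = edge.transform(
    [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False
)
print(edge.exported_program().graph_module.code)  # inspect the fused graph

# Lower to an ExecuTorch program; view_copy removal can be disabled when a
# test expects to see aten.view_copy.out in the final graph.
et = edge.to_executorch(exir.ExecutorchBackendConfig(remove_view_copy=False))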
- m = m.transform(QuantFusionPass(_fix_node_meta_val=True)) + m = m.transform( + [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False + ) # check that we are using functional variant of q/dq/add/slice # make sure we only have one quant and one dequant since the q/dq around slice # should be fused @@ -169,14 +176,14 @@ def forward(self, x, y): ).check( "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default" ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) m = m.to_executorch() # check that we are using out variant of add and slice_copy FileCheck().check("torch.ops.quantized_decomposed.add.out").check( "torch.ops.aten.slice_copy.Tensor_out" - ).run(m.dump_graph_module().code) + ).run(m.exported_program().graph_module.code) def test_cat(self) -> None: class M(torch.nn.Module): @@ -197,9 +204,9 @@ def forward(self, x, y): m(*example_inputs) m = _convert_to_reference_decomposed_fx(m) config = EdgeCompileConfig(_check_ir_validity=False) - m = exir.capture(m, example_inputs, CaptureConfig()).to_edge(config=config) + m = to_edge(export(m, example_inputs), compile_config=config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. - m = m.transform(QuantFusionPass()) + m = m.transform([QuantFusionPass()], check_ir_validity=False) # check that we are using functional variant of q/dq/cat FileCheck().check_count( "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default", @@ -210,7 +217,7 @@ def forward(self, x, y): 1, exactly=True, ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) m = m.to_executorch() @@ -224,7 +231,7 @@ def forward(self, x, y): ).check("torch.ops.aten.cat.out").check_count( "torch.ops.quantized_decomposed.dequantize_per_tensor.out", 1, exactly=True ).run( - m.dump_graph_module().code + m.exported_program().graph_module.code ) def test_embedding_byte(self) -> None: @@ -292,16 +299,18 @@ def forward(self, indices): _check_ir_validity=False, _use_edge_ops=True, ) - m = exir.capture(m, example_inputs).to_edge(config=compile_config) + m = to_edge(export(m, example_inputs), compile_config=compile_config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. - m = m.transform(QuantFusionPass(_fix_node_meta_val=True)) + m = m.transform( + [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False + ) # check that we are using functional variant of q/dq/cat FileCheck().check( "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_channel_default", ).check( "executorch_exir_dialects_edge__ops_quantized_decomposed_embedding_byte_default" ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) # TODO: enable after the out variants of quantize_per_channel is supported @@ -348,17 +357,18 @@ def forward(self, indices): _check_ir_validity=False, _use_edge_ops=True, ) - m = exir.capture(m, example_inputs).to_edge(config=compile_config) + m = to_edge(export(m, example_inputs), compile_config=compile_config) # QuantFusionPass should be part of to_executorch() config, separating it out so that we can check the graph. 
- m = m.transform(QuantFusionPass(_fix_node_meta_val=True)) - m(*example_inputs) + m = m.transform( + [QuantFusionPass(_fix_node_meta_val=True)], check_ir_validity=False + ) # check that we are using functional variant of q/dq/cat FileCheck().check( "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_channel_default", ).check( "executorch_exir_dialects_edge__ops_quantized_decomposed_embedding_byte_default" ).run( - m.exported_program.graph_module.code + m.exported_program().graph_module.code ) # TODO: enable after the out variants of quantize_per_channel is supported diff --git a/exir/tests/test_remove_view_copy.py b/exir/tests/test_remove_view_copy.py new file mode 100644 index 00000000000..0c5b61f8d8f --- /dev/null +++ b/exir/tests/test_remove_view_copy.py @@ -0,0 +1,202 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import unittest + +import torch +import torch.nn as nn +from executorch.exir import memory, to_edge +from executorch.exir.capture._config import ExecutorchBackendConfig +from executorch.exir.passes import MemoryPlanningPass + + +class TestModel1(nn.Module): + def __init__(self): + super().__init__() + self.parameter = nn.Parameter(torch.rand(5, 6)) + self.parameter.requires_grad = False + + def forward(self, x): + v1 = self.parameter.view( + 6, 5 + ) # removed, lifetime of parameter will be extended + v2 = x.view(6, 5) # not removed + v3 = torch.ops.aten.mul.Tensor(v1, v2).view( + 30 + ) # removed, lifetime of mul.Tensor will be extended + return v3 + + def get_example_inputs(self): + return (torch.rand(5, 6),) + + +class TestRemoveViewCopy(unittest.TestCase): + def test_disable(self) -> None: + model = TestModel1() + model.eval() + example_inputs = model.get_example_inputs() + ep = torch.export.export(model, example_inputs) + etpm = to_edge(ep).to_executorch( + config=ExecutorchBackendConfig( + remove_view_copy=False, + memory_planning_pass=MemoryPlanningPass( + "greedy", alloc_graph_input=False + ), + ), + ) + + for node in etpm.exported_program().graph_module.graph.nodes: + assert node.target != memory.view + + def test_output_matches(self) -> None: + model = TestModel1() + model.eval() + example_inputs = model.get_example_inputs() + ep = torch.export.export(model, example_inputs) + + epm_remove = to_edge(ep) + epm_no_remove = copy.deepcopy( + epm_remove + ) # to_executorch modifies the edge_program, so we make a copy + + # Run pass with no removal + etpm_remove = epm_remove.to_executorch( + config=ExecutorchBackendConfig( + remove_view_copy=True, + memory_planning_pass=MemoryPlanningPass( + "greedy", alloc_graph_input=False + ), + ), + ) + + # Run pass with removal + etpm_no_remove = epm_no_remove.to_executorch( + config=ExecutorchBackendConfig( + remove_view_copy=True, + memory_planning_pass=MemoryPlanningPass( + "greedy", alloc_graph_input=False + ), + ), + ) + + out_remove = etpm_remove.exported_program().module()(*example_inputs) + out_no_remove = etpm_no_remove.exported_program().module()(*example_inputs) + + self.assertTrue(torch.allclose(out_remove, out_no_remove)) + + def test_spec(self) -> None: + model = TestModel1() + model.eval() + example_inputs = model.get_example_inputs() + ep = torch.export.export(model, example_inputs) + + etpm = to_edge(ep).to_executorch( + config=ExecutorchBackendConfig( + remove_view_copy=True, + 
memory_planning_pass=MemoryPlanningPass( + "greedy", alloc_graph_input=False + ), + ), + ) + + # etpm.exported_program().graph.print_tabular() + + # idx opcode name target args kwargs + # --- ------------- ------------------------ ---------------------------------- -------------------------------------------------- -------------- + # 0 placeholder p_parameter p_parameter () {} + # 1 placeholder x x () {} + # 2 call_function aten_view_copy_default (p_parameter, [6, 5]) {} + # 3 call_function aten_view_copy_default_1 (x, [6, 5]) {} + # 4 call_function alloc (((6, 5), torch.float32),) {} + # 5 call_function aten_mul_tensor aten.mul.out (aten_view_copy_default, aten_view_copy_default_1) {'out': alloc} + # 6 call_function aten_view_copy_default_2 (aten_mul_tensor, [30]) {} + # 7 output output_1 output ((aten_view_copy_default_2,),) {} + + for node in etpm.exported_program().graph.nodes: + if node.name == "p_parameter": + # p_parameter's lifetime is extended through aten_view_copy_default (memory.view) to idx 5 + self.assertEqual(node.meta["spec"].lifetime, [0, 5]) + elif node.name == "aten_view_copy_default": + # aten_view_copy_default is a memory.view of p_parameter. + # p_parameter is a constant with storage, so we check that the view's storage matches the base + + # assert base is p_parameter + self.assertEqual(node.args[0].name, "p_parameter") + + # assert base is const with storage + self.assertTrue(node.args[0].meta["spec"].const) + self.assertTrue(node.args[0].meta["spec"].storage is not None) + self.assertTrue(node.args[0].meta["spec"].mem_id is None) + self.assertTrue(node.args[0].meta["spec"].mem_offset is None) + + # assert self is const with storage + self.assertTrue(node.meta["spec"].const) + self.assertTrue(node.meta["spec"].storage is not None) + self.assertTrue(node.meta["spec"].mem_id is None) + self.assertTrue(node.meta["spec"].mem_offset is None) + + # assert storage matches + self.assertEqual( + node.meta["spec"].storage, node.args[0].meta["spec"].storage + ) + + # assert lifetime matches + self.assertEqual( + node.meta["spec"].lifetime, node.args[0].meta["spec"].lifetime + ) + elif node.name == "aten_mul_tensor": + # aten_mul_tensor's lifetime is extended through aten_view_copy_default_2 (memory.view) to idx 7 + self.assertEqual(node.meta["spec"].lifetime, [4, 7]) + elif node.name == "aten_view_copy_default_2": + # aten_view_copy_default_2 is a memory.view of aten_mul_tensor + + # assert base is aten_mul_tensor + self.assertEqual(node.args[0].name, "aten_mul_tensor") + + # assert base and self are not const, do not have storage, + # but do have mem_id and mem_offset + self.assertFalse(node.args[0].meta["spec"].const) + self.assertTrue(node.args[0].meta["spec"].storage is None) + self.assertTrue(node.args[0].meta["spec"].mem_id is not None) + self.assertTrue(node.args[0].meta["spec"].mem_offset is not None) + + self.assertFalse(node.meta["spec"].const) + self.assertTrue(node.meta["spec"].storage is None) + self.assertTrue(node.meta["spec"].mem_id is not None) + self.assertTrue(node.meta["spec"].mem_offset is not None) + + # assert self and base mem_id, mem_offset, and lifetime matches + self.assertEqual( + node.meta["spec"].mem_id, node.args[0].meta["spec"].mem_id + ) + self.assertEqual( + node.meta["spec"].mem_offset, node.args[0].meta["spec"].mem_offset + ) + self.assertEqual( + node.meta["spec"].lifetime, node.args[0].meta["spec"].lifetime + ) + + # Test evalues in execution plan + plan = etpm.executorch_program.execution_plan[0] + self.assertEqual(plan.operators[0].name, 
"executorch_prim::et_view") + self.assertEqual(plan.operators[1].name, "aten::mul") + + instructions = plan.chains[0].instructions + self.assertEqual(len(instructions), 4) + + self.assertEqual( + instructions[0].instr_args.op_index, 0 # pyre-ignore + ) # view @ idx2 + self.assertEqual( + instructions[1].instr_args.op_index, 0 # pyre-ignore + ) # view @ idx3 + self.assertEqual( + instructions[2].instr_args.op_index, 1 # pyre-ignore + ) # aten:mul @ idx5 + self.assertEqual( + instructions[3].instr_args.op_index, 0 # pyre-ignore + ) # view @ idx6 diff --git a/exir/tests/test_serde.py b/exir/tests/test_serde.py index d4be4686590..2c68920ff34 100644 --- a/exir/tests/test_serde.py +++ b/exir/tests/test_serde.py @@ -6,6 +6,7 @@ # pyre-strict +import io import unittest from typing import Tuple @@ -47,7 +48,7 @@ def check_ep( self.assertTrue(torch.allclose(orig, loaded)) # pyre-ignore - def check_serde(self, m, inputs) -> None: + def check_serde(self, m, inputs, check_executorch=True) -> None: aten = export(m, inputs) aten_new = deserialize(serialize(aten)) self.check_ep(aten, aten_new, inputs) @@ -56,10 +57,23 @@ def check_serde(self, m, inputs) -> None: edge_new = deserialize(serialize(edge.exported_program())) self.check_ep(edge.exported_program(), edge_new, inputs) + buffer = io.BytesIO() + exir.save(edge.exported_program(), buffer) + buffer.seek(0) + loaded_ep = exir.load(buffer) + self.check_ep(edge.exported_program(), loaded_ep, inputs) + executorch = edge.to_executorch().exported_program() executorch_new = deserialize(serialize(executorch)) - with torch.no_grad(): - self.check_ep(executorch, executorch_new, inputs) + if check_executorch: + with torch.no_grad(): + self.check_ep(executorch, executorch_new, inputs) + + buffer = io.BytesIO() + exir.save(executorch, buffer) + buffer.seek(0) + loaded_ep = exir.load(buffer) + self.check_ep(executorch, loaded_ep, inputs) def test_basic(self) -> None: class MyModule(torch.nn.Module): @@ -88,7 +102,12 @@ def get_random_inputs(self): model = MyModel() inputs = model.get_random_inputs() - self.check_serde(model, inputs) + # We set check_executorch to false for this test because this triggers + # an edge case where calling .module() on the executorch exported program + # will cause an unlift pass to be run on the graph and dead code elimination + # will be subsequently run, which essentially causes the split_copy op to be + # removed. 
+ self.check_serde(model, inputs, check_executorch=False) def test_to_out_variant_multiple_out(self) -> None: class MyModel(torch.nn.Module): @@ -140,6 +159,28 @@ def forward(self, x): edge_new = deserialize(serialize(edge.exported_program())) self.check_ep(edge.exported_program(), edge_new, model_inputs) + def test_model_with_weights(self) -> None: + class LinearAdd(nn.Module): + def __init__(self, M: int, N: int): + super().__init__() + self.M = M + self.N = N + self.linear = torch.nn.Linear(M, N) + + def forward(self, x, y): + x = self.linear(x) + y = self.linear(y) + return torch.add(x, y) + + @classmethod + def _get_random_inputs(cls): + return (torch.rand(128, 20), torch.rand(128, 20)) + + linear_add = LinearAdd(20, 30) + model_inputs = LinearAdd._get_random_inputs() + + self.check_serde(linear_add, model_inputs) + def test_delegate_partitioner(self) -> None: class Model(torch.nn.Module): def __init__(self): diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index c69cea0323e..ec8cf850317 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -8,6 +8,11 @@ cmake_minimum_required(VERSION 3.19) project(executorch_jni) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) + # Can't set to 11 due to executor_runner.cpp make_unique +endif() + if(NOT ANDROID) message(FATAL_ERROR "This directory is for Android build only") endif() @@ -58,14 +63,12 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) add_library(llama_runner STATIC IMPORTED) set_property(TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH}) - set(CUSTOM_OPS_LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops_lib.a) - add_library(custom_ops_lib STATIC IMPORTED) - set_property(TARGET custom_ops_lib PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_LIB_PATH}) - set(CUSTOM_OPS_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops.a) add_library(custom_ops STATIC IMPORTED) set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) - target_link_options_shared_lib(custom_ops_lib) + target_link_options_shared_lib(custom_ops) + + target_link_options_shared_lib(quantized_ops_lib) if(TARGET pthreadpool) set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ../../backends/xnnpack/threadpool/cpuinfo_utils.cpp) @@ -82,6 +85,16 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) endif() target_include_directories(executorch_llama_jni PRIVATE ${_common_include_directories}) target_link_libraries(executorch_llama_jni ${link_libraries} llama_runner - custom_ops custom_ops_lib cpublas eigen_blas) + custom_ops cpublas eigen_blas quantized_kernels quantized_ops_lib) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) + if(EXECUTORCH_USE_TIKTOKEN) + set(ABSL_ENABLE_INSTALL ON) + set(_pic_flag + ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2) + set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + target_link_libraries(executorch_llama_jni re2::re2) + endif() endif() diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 4089a7e0ff0..ac93f5a7137 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -324,12 +324,14 @@ class ExecuTorchJni : public 
facebook::jni::HybridClass { #endif - ET_CHECK_MSG( - result.ok(), - "Execution of method %s failed with status 0x%" PRIx32, - method.c_str(), - static_cast(result.error())); - ET_LOG(Info, "Model executed successfully."); + if (!result.ok()) { + facebook::jni::throwNewJavaException( + "java/lang/Exception", + "Execution of method %s failed with status 0x%" PRIx32, + method.c_str(), + static_cast(result.error())); + return {}; + } facebook::jni::local_ref> jresult = facebook::jni::JArrayClass::newArray(result.get().size()); diff --git a/extension/aot_util/README.md b/extension/aot_util/README.md deleted file mode 100644 index dbb3866bec3..00000000000 --- a/extension/aot_util/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# AOT Util - -Ahead-of-time (AOT) utility library. Contains native code used by the AOT lowering and delegation logic. Note -that this library should build independently of the runtime code, and as such, should not have dependencies -on runtime targets. - -This library is intended to be built and distributed as part of the Python pip package, such that it can be -loaded by AOT Python code. - diff --git a/extension/aten_util/aten_bridge.cpp b/extension/aten_util/aten_bridge.cpp index 907d23e2384..993f5e40778 100644 --- a/extension/aten_util/aten_bridge.cpp +++ b/extension/aten_util/aten_bridge.cpp @@ -68,6 +68,8 @@ torch::executor::ScalarType torchToExecuTorchScalarType(caffe2::TypeMeta type) { return torch::executor::ScalarType::Byte; case c10::ScalarType::Char: return torch::executor::ScalarType::Char; + case c10::ScalarType::Short: + return torch::executor::ScalarType::Short; case c10::ScalarType::Half: return torch::executor::ScalarType::Half; case c10::ScalarType::Int: @@ -95,6 +97,8 @@ c10::ScalarType execuTorchtoTorchScalarType(torch::executor::ScalarType type) { return c10::ScalarType::Byte; case torch::executor::ScalarType::Char: return c10::ScalarType::Char; + case torch::executor::ScalarType::Short: + return c10::ScalarType::Short; case torch::executor::ScalarType::Half: return c10::ScalarType::Half; case torch::executor::ScalarType::Int: diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index 976549af8db..92d19c04843 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -149,8 +149,7 @@ struct type_convert< } c10::ScalarType scalar_type = static_cast(val.scalar_type()); - converted = - at::from_blob(val.mutable_data_ptr(), val.numel(), sizes, scalar_type); + converted = at::from_blob(val.mutable_data_ptr(), sizes, scalar_type); } ATensor call() { return converted; diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl index b396cb78325..6e325830292 100644 --- a/extension/aten_util/targets.bzl +++ b/extension/aten_util/targets.bzl @@ -27,6 +27,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core/exec_aten:lib", diff --git a/extension/data_loader/CMakeLists.txt b/extension/data_loader/CMakeLists.txt index 90008b1981c..1a0ec7b9cc1 100644 --- a/extension/data_loader/CMakeLists.txt +++ b/extension/data_loader/CMakeLists.txt @@ -25,6 +25,6 @@ target_compile_options(extension_data_loader PUBLIC ${_common_compile_options}) # Install libraries install( TARGETS extension_data_loader - DESTINATION 
${CMAKE_BINARY_DIR}/lib + DESTINATION lib INCLUDES DESTINATION ${_common_include_directories}) diff --git a/extension/evalue_util/print_evalue.cpp b/extension/evalue_util/print_evalue.cpp index ba6f92de8a5..efef4c414f5 100644 --- a/extension/evalue_util/print_evalue.cpp +++ b/extension/evalue_util/print_evalue.cpp @@ -149,12 +149,12 @@ void print_tensor(std::ostream& os, exec_aten::Tensor tensor) { // // TODO(T159700776): Format multidimensional data like numpy/PyTorch does. // https://github.com/pytorch/pytorch/blob/main/torch/_tensor_str.py -#define PRINT_TENSOR_DATA(ctype, dtype) \ - case ScalarType::dtype: \ - print_scalar_list( \ - os, \ - ArrayRef(tensor.data_ptr(), tensor.numel()), \ - /*print_length=*/false); \ +#define PRINT_TENSOR_DATA(ctype, dtype) \ + case ScalarType::dtype: \ + print_scalar_list( \ + os, \ + ArrayRef(tensor.const_data_ptr(), tensor.numel()), \ + /*print_length=*/false); \ break; switch (tensor.scalar_type()) { diff --git a/extension/kernel_util/meta_programming.h b/extension/kernel_util/meta_programming.h index 46262b843ea..c412e907ea0 100644 --- a/extension/kernel_util/meta_programming.h +++ b/extension/kernel_util/meta_programming.h @@ -49,7 +49,7 @@ struct is_compile_time_function_pointer< CompileTimeFunctionPointer> : std::true_type {}; #define EXECUTORCH_FN_TYPE(func) \ - CompileTimeFunctionPointer< \ + ::torch::executor::CompileTimeFunctionPointer< \ std::remove_pointer_t>, \ func> #define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)() diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 83ded144469..2c9733d1dae 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -51,8 +51,8 @@ Module::Module( std::unique_ptr event_tracer) : data_loader_(std::move(data_loader)), memory_allocator_( - std::move(memory_allocator) - ?: std::make_unique()), + memory_allocator ? std::move(memory_allocator) + : std::make_unique()), event_tracer_(std::move(event_tracer)) { runtime_init(); } diff --git a/extension/pybindings/TARGETS b/extension/pybindings/TARGETS index 0b4e9ef3049..9dee0e208b1 100644 --- a/extension/pybindings/TARGETS +++ b/extension/pybindings/TARGETS @@ -30,9 +30,9 @@ runtime.genrule( srcs = [":pybinding_types"], outs = { "aten_lib.pyi": ["aten_lib.pyi"], - "portable_lib.pyi": ["portable_lib.pyi"], + "_portable_lib.pyi": ["_portable_lib.pyi"], }, - cmd = "cp $(location :pybinding_types)/* $OUT/portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi", + cmd = "cp $(location :pybinding_types)/* $OUT/_portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi", visibility = ["//executorch/extension/pybindings/..."], ) @@ -46,8 +46,9 @@ executorch_pybindings( executorch_pybindings( compiler_flags = ["-std=c++17"], cppdeps = PORTABLE_MODULE_DEPS + MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB, - python_module_name = "portable_lib", - types = ["//executorch/extension/pybindings:pybindings_types_gen[portable_lib.pyi]"], + # Give this an underscore prefix because it has a pure python wrapper. 
+ python_module_name = "_portable_lib", + types = ["//executorch/extension/pybindings:pybindings_types_gen[_portable_lib.pyi]"], visibility = ["PUBLIC"], ) @@ -58,3 +59,10 @@ executorch_pybindings( types = ["//executorch/extension/pybindings:pybindings_types_gen[aten_lib.pyi]"], visibility = ["PUBLIC"], ) + +runtime.python_library( + name = "portable_lib", + srcs = ["portable_lib.py"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [":_portable_lib"], +) diff --git a/extension/pybindings/portable_lib.py b/extension/pybindings/portable_lib.py new file mode 100644 index 00000000000..b9ed089f918 --- /dev/null +++ b/extension/pybindings/portable_lib.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +# When installed as a pip wheel, we must import `torch` before trying to import +# the pybindings shared library extension. This will load libtorch.so and +# related libs, ensuring that the pybindings lib can resolve those runtime +# dependencies. +import torch as _torch + +# Let users import everything from the C++ _portable_lib extension as if this +# python file defined them. Although we could import these dynamically, it +# wouldn't preserve the static type annotations. +from executorch.extension.pybindings._portable_lib import ( # noqa: F401 + # Disable "imported but unused" (F401) checks. + _create_profile_block, # noqa: F401 + _dump_profile_results, # noqa: F401 + _get_operator_names, # noqa: F401 + _load_bundled_program_from_buffer, # noqa: F401 + _load_for_executorch, # noqa: F401 + _load_for_executorch_from_buffer, # noqa: F401 + _load_for_executorch_from_bundled_program, # noqa: F401 + _reset_profile_results, # noqa: F401 + BundledModule, # noqa: F401 + ExecuTorchModule, # noqa: F401 +) + +# Clean up so that `dir(portable_lib)` is the same as `dir(_portable_lib)` +# (apart from some __dunder__ names). +del _torch diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index 0392b5b7cef..8f2b43cac9e 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -7,9 +7,12 @@ # pyre-strict from typing import Any, Dict, List, Sequence, Tuple -class ExecutorchModule: +class ExecuTorchModule: + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def __call__(self, inputs: Any) -> List[Any]: ... + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def run_method(self, method_name: str, inputs: Sequence[Any]) -> List[Any]: ... + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def forward(self, inputs: Sequence[Any]) -> List[Any]: ... # Bundled program methods. def load_bundled_input( @@ -30,16 +33,17 @@ class BundledModule: ... def _load_for_executorch( path: str, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_for_executorch_from_buffer( buffer: bytes, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_for_executorch_from_bundled_program( module: BundledModule, enable_etdump: bool = False -) -> ExecutorchModule: ... +) -> ExecuTorchModule: ... def _load_bundled_program_from_buffer( buffer: bytes, non_const_pool_size: int = ... ) -> BundledModule: ... +def _get_operator_names() -> List[str]: ... def _create_profile_block(name: str) -> None: ... def _dump_profile_results() -> bytes: ... 
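Because the compiled extension is now named `_portable_lib` and wrapped by a pure-Python `portable_lib` module, user code keeps importing from the un-prefixed name. A minimal usage sketch, assuming a previously exported `model.pte` file and a single-tensor input (both hypothetical):

import torch  # importing torch up front mirrors what the wrapper does to load libtorch

from executorch.extension.pybindings.portable_lib import (
    _load_for_executorch_from_buffer,
)

# "model.pte" and the input shape are placeholders.
with open("model.pte", "rb") as f:
    module = _load_for_executorch_from_buffer(f.read())

outputs = module.forward([torch.randn(1, 3)])  # returns a List[Any]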
def _reset_profile_results() -> None: ... diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h index aa4657bcb58..16b712d6595 100644 --- a/extension/runner_util/managed_tensor.h +++ b/extension/runner_util/managed_tensor.h @@ -10,6 +10,10 @@ #include #include #include +#include +// NOTE: required by torchchat install_et.sh script. +// @nolint PATTERNLINT Ok to use stdlib for this optional library +#include #ifdef USE_ATEN_LIB #include diff --git a/install_requirements.sh b/install_requirements.sh index c96aefc5628..d541a30b221 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -9,7 +9,7 @@ # Dependencies are defined in .pyproject.toml if [[ -z $PYTHON_EXECUTABLE ]]; then - if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]]; + if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]] || [[ ! -x "$(command -v python)" ]]; then PYTHON_EXECUTABLE=python3 else @@ -17,9 +17,16 @@ then fi fi +if [[ "$PYTHON_EXECUTABLE" == "python" ]]; +then + PIP_EXECUTABLE=pip +else + PIP_EXECUTABLE=pip3 +fi + + # Parse options. EXECUTORCH_BUILD_PYBIND=OFF -CMAKE_ARGS="" for arg in "$@"; do case $arg in @@ -53,7 +60,7 @@ done # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION=dev20240324 +NIGHTLY_VERSION=dev20240422 # The pip repository that hosts nightly torch packages. TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" @@ -67,6 +74,7 @@ EXIR_REQUIREMENTS=( # pip packages needed for development. DEVEL_REQUIREMENTS=( cmake # For building binary targets. + pyyaml # Imported by the kernel codegen tools. setuptools # For building the pip package. tomli # Imported by extract_sources.py when using python < 3.11. wheel # For building the pip package archive. @@ -92,7 +100,7 @@ REQUIREMENTS_TO_INSTALL=( # Install the requirements. `--extra-index-url` tells pip to look for package # versions on the provided URL if they aren't available on the default URL. -pip install --extra-index-url "${TORCH_NIGHTLY_URL}" \ +$PIP_EXECUTABLE install --extra-index-url "${TORCH_NIGHTLY_URL}" \ "${REQUIREMENTS_TO_INSTALL[@]}" # @@ -101,4 +109,5 @@ pip install --extra-index-url "${TORCH_NIGHTLY_URL}" \ EXECUTORCH_BUILD_PYBIND="${EXECUTORCH_BUILD_PYBIND}" \ CMAKE_ARGS="${CMAKE_ARGS}" \ - pip install . --no-build-isolation -v + CMAKE_BUILD_ARGS="${CMAKE_BUILD_ARGS}" \ + $PIP_EXECUTABLE install . --no-build-isolation -v diff --git a/kernels/aten/targets.bzl b/kernels/aten/targets.bzl index 8e2f1b04c48..519dfaf3484 100644 --- a/kernels/aten/targets.bzl +++ b/kernels/aten/targets.bzl @@ -35,18 +35,3 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) - - # TODO(T149415783): temporarily testing portable kernel "in aten mode" and remove after migration is done - executorch_generated_lib( - name = "generated_lib_aten", - aten_mode = True, - deps = [ - ":executorch_aten_ops", - ], - functions_yaml_target = None, - define_static_targets = True, - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - ) diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 98acdf88ca9..cd34eb78e39 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -47,7 +47,7 @@ endif() # Build cpublas. 
list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) -target_link_libraries(cpublas PRIVATE executorch eigen_blas) +target_link_libraries(cpublas PRIVATE executorch_no_prim_ops eigen_blas) target_compile_options(cpublas PUBLIC ${_common_compile_options}) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and @@ -61,7 +61,7 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) -target_link_libraries(optimized_kernels PRIVATE executorch cpublas) +target_link_libraries(optimized_kernels PRIVATE executorch_no_prim_ops cpublas) target_compile_options(optimized_kernels PUBLIC ${_common_compile_options}) # Build a library for _optimized_kernels_srcs # diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index a532cfc7ba6..5bbba7b39f2 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -41,10 +41,12 @@ Tensor& add_out( ET_KERNEL_CHECK( ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES(common_type, ctx, "add.out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { + constexpr auto name = "add.out"; + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALB_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { CTYPE_IN alpha_val; utils::extract_scalar(alpha, &alpha_val); @@ -99,29 +101,29 @@ Tensor& add_scalar_out( common_type = ScalarType::Float; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "add.Scalar_out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES( - common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( - out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() { - CTYPE_B b_val; - utils::extract_scalar(b, &b_val); - CTYPE_IN b_casted = static_cast(b_val); - CTYPE_IN alpha_val; - utils::extract_scalar(alpha, &alpha_val); - - apply_unary_map_fn( - [b_casted, alpha_val](const CTYPE_A val_a) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN value = a_casted + alpha_val * b_casted; - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); + constexpr auto name = "add.Scalar_out"; + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALB_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B b_val; + utils::extract_scalar(b, &b_val); + CTYPE_IN b_casted = static_cast(b_val); + CTYPE_IN alpha_val; + utils::extract_scalar(alpha, &alpha_val); + + apply_unary_map_fn( + [b_casted, alpha_val](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN value = a_casted + alpha_val * b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); }); }); diff --git a/kernels/portable/cpu/op_cumsum.cpp b/kernels/portable/cpu/op_cumsum.cpp index 7fe483ad2e1..fffc2d46392 100644 --- 
a/kernels/portable/cpu/op_cumsum.cpp +++ b/kernels/portable/cpu/op_cumsum.cpp @@ -11,8 +11,8 @@ #include #include #include -//#include -//#include +// #include +// #include namespace torch { namespace executor { diff --git a/kernels/portable/cpu/op_index.cpp b/kernels/portable/cpu/op_index.cpp index 16817a27195..d70ceaa859b 100644 --- a/kernels/portable/cpu/op_index.cpp +++ b/kernels/portable/cpu/op_index.cpp @@ -40,12 +40,11 @@ Tensor& index_Tensor_out( if (block_count == 0) { ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND( - Bool, in_type, ctx, "index.Tensor_out", CTYPE, [&]() { - const CTYPE* const in_data = in.const_data_ptr(); - CTYPE* const out_data = out.mutable_data_ptr(); - memcpy(out_data, in_data, in.nbytes()); - }); + ET_SWITCH_REALHB_TYPES(in_type, ctx, "index.Tensor_out", CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + memcpy(out_data, in_data, in.nbytes()); + }); return out; } @@ -85,20 +84,19 @@ Tensor& index_Tensor_out( compute_dim_map(in, indices, dim_map, block_count == 1); compute_index_map(in, indices, ix_map); - ET_SWITCH_REAL_TYPES_AND( - Bool, in_type, ctx, "index.Tensor_out", CTYPE, [&]() { - const CTYPE* const in_data = in.const_data_ptr(); - CTYPE* const out_data = out.mutable_data_ptr(); - - for (auto out_ix = 0; out_ix < out.numel(); out_ix++) { - size_t in_ix = 0; - bool success = true; - std::tie(in_ix, success) = - get_in_ix(in, indices, out, out_ix, start, xdim, dim_map, ix_map); - ET_KERNEL_CHECK(ctx, success, InvalidArgument, ); - out_data[out_ix] = in_data[in_ix]; - } - }); + ET_SWITCH_REALHB_TYPES(in_type, ctx, "index.Tensor_out", CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + for (auto out_ix = 0; out_ix < out.numel(); out_ix++) { + size_t in_ix = 0; + bool success = true; + std::tie(in_ix, success) = + get_in_ix(in, indices, out, out_ix, start, xdim, dim_map, ix_map); + ET_KERNEL_CHECK(ctx, success, InvalidArgument, ); + out_data[out_ix] = in_data[in_ix]; + } + }); return out; } diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index da8599d5fac..068f402c07d 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -15,7 +15,10 @@ namespace executor { namespace native { Tensor& isinf_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isinf, ctx, in, out); + // Lambda is syntactic sugar needed to workaround compilation on some older + // non-compatible distros where isnan is returning int rather than bool + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isinf(x); }, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index 2a82b127d3e..09fb4f5f8ac 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -15,7 +15,10 @@ namespace executor { namespace native { Tensor& isnan_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isnan, ctx, in, out); + // Lambda is syntactic sugar needed to workaround compilation on some older + // non-compatible distros where isnan is returning int rather than bool + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isnan(x); }, ctx, in, out); } } // namespace native 
diff --git a/kernels/portable/cpu/op_split_with_sizes_copy.cpp b/kernels/portable/cpu/op_split_with_sizes_copy.cpp index 828b14b24ce..7d1b485e7a4 100644 --- a/kernels/portable/cpu/op_split_with_sizes_copy.cpp +++ b/kernels/portable/cpu/op_split_with_sizes_copy.cpp @@ -55,8 +55,7 @@ void split_with_sizes_copy_out( target_out_sizes[dim] = static_cast(split_sizes[i]); ET_KERNEL_CHECK( ctx, - tensor_is_broadcastable_to( - {target_out_sizes, target_out_ndim}, out[i].sizes()), + resize_tensor(out[i], {target_out_sizes, target_out_ndim}) == Error::Ok, InvalidArgument, ); } diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index e8a0fc919d7..2df71a6d6b0 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -29,7 +29,7 @@ Tensor& sub_out( InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -37,15 +37,16 @@ Tensor& sub_out( ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); ET_KERNEL_CHECK( ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() { - ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES(common_type, ctx, "sub.out", CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { + constexpr auto name = "sub.out"; + + ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { CTYPE_IN alpha_val; utils::extract_scalar(alpha, &alpha_val); @@ -84,11 +85,11 @@ Tensor& sub_scalar_out( out, "Failed to resize output tensor."); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); - ScalarType alpha_type = utils::get_scalar_dtype(b); + ScalarType alpha_type = utils::get_scalar_dtype(alpha); ScalarType common_type = utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false); ScalarType out_type = out.scalar_type(); @@ -100,31 +101,30 @@ Tensor& sub_scalar_out( common_type = ScalarType::Float; } - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_REAL_TYPES( - b_type, ctx, "sub.Scalar_out", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES( - common_type, ctx, "sub.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES( - out_type, ctx, "sub.Scalar_out", CTYPE_OUT, [&]() { - CTYPE_B b_val; - utils::extract_scalar(b, &b_val); - CTYPE_IN b_casted = static_cast(b_val); - CTYPE_IN alpha_val; - utils::extract_scalar(alpha, &alpha_val); - - apply_unary_map_fn( - [b_casted, alpha_val](const CTYPE_A val_a) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN value = a_casted - alpha_val * b_casted; - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); + constexpr auto name 
= "sub.Scalar_out"; + + ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_REAL_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B b_val; + utils::extract_scalar(b, &b_val); + CTYPE_IN b_casted = static_cast(b_val); + CTYPE_IN alpha_val; + utils::extract_scalar(alpha, &alpha_val); + + apply_unary_map_fn( + [b_casted, alpha_val](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN value = a_casted - alpha_val * b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); }); + }); + }); }); return out; diff --git a/kernels/prim_ops/et_view.cpp b/kernels/prim_ops/et_view.cpp index 69a75170260..b3d3592fe7b 100644 --- a/kernels/prim_ops/et_view.cpp +++ b/kernels/prim_ops/et_view.cpp @@ -87,18 +87,7 @@ void et_view(RuntimeContext& context, EValue** stack) { // Do some checks ET_CHECK(self.numel() == out.numel()); - // If out has a data_ptr, it must match self - // We hit this path for memory-planned tensors - if (out.const_data_ptr() != nullptr) { - ET_CHECK_MSG( - self.const_data_ptr() == out.const_data_ptr(), - "out has a non-null data_ptr, but it does not equal self's data_ptr."); - - // nothing else to do - return; - } - - // out.const_data_ptr() == nullptr now + // Update data ptr ET_CHECK_MSG( internal::set_tensor_data( out, diff --git a/kernels/prim_ops/test/prim_ops_test.cpp b/kernels/prim_ops/test/prim_ops_test.cpp index fdcc13cf13e..7d91a0f6820 100644 --- a/kernels/prim_ops/test/prim_ops_test.cpp +++ b/kernels/prim_ops/test/prim_ops_test.cpp @@ -331,14 +331,13 @@ TEST_F(RegisterPrimOpsTest, TestETView) { EValue(good_outs[0]), EValue(good_outs[1])}; // bad outs expect death - constexpr int N_BAD_OUTS = 3; + constexpr int N_BAD_OUTS = 2; Tensor bad_outs[N_BAD_OUTS] = { tf.ones({1, 3, 2, 1}), // wrong rank - tf.ones({1, 3, 3}), // wrong size - tf.ones({1, 3, 2}) // occupied data_ptr + tf.ones({1, 3, 3}) // wrong size }; EValue bad_out_evalues[N_BAD_OUTS] = { - EValue(bad_outs[0]), EValue(bad_outs[1]), EValue(bad_outs[2])}; + EValue(bad_outs[0]), EValue(bad_outs[1])}; // *************************************************************************** // Run tests @@ -349,7 +348,6 @@ TEST_F(RegisterPrimOpsTest, TestETView) { // Bad out stacks {&self_evalue, &size_int_list_evalue, &bad_out_evalues[0]}, {&self_evalue, &size_int_list_evalue, &bad_out_evalues[1]}, - {&self_evalue, &size_int_list_evalue, &bad_out_evalues[2]}, // Bad size stacks {&self_evalue, &bad_size_int_list_evalue1, &good_out_evalues[0]}, {&self_evalue, &bad_size_int_list_evalue2, &good_out_evalues[0]}}; diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt index 7be9e73827f..b34ba75ae29 100644 --- a/kernels/quantized/CMakeLists.txt +++ b/kernels/quantized/CMakeLists.txt @@ -10,6 +10,9 @@ # ~~~ cmake_minimum_required(VERSION 3.19) +option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT + "Build the optimized ops library for AOT export usage" OFF) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) @@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}") # quantized_ops_aot_lib quantized_ops_lib but none of these is a common # dependency of the other(s). This is not allowed by the Xcode "new build # system". 
-if(NOT CMAKE_GENERATOR STREQUAL "Xcode") +if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT) # Build a AOT library to register quantized ops into PyTorch. This is a hack. set(_quantized_sources ${_quantized_kernels__srcs} diff --git a/kernels/quantized/cpu/op_embedding4b.cpp b/kernels/quantized/cpu/op_embedding4b.cpp new file mode 100644 index 00000000000..f234ee224ca --- /dev/null +++ b/kernels/quantized/cpu/op_embedding4b.cpp @@ -0,0 +1,344 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using Scalar = exec_aten::Scalar; +using ScalarType = exec_aten::ScalarType; + +namespace { + +/** + * Asserts that the parameters are valid. + */ +void check_embedding_4bit_args( + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const int64_t weight_quant_min, + const int64_t weight_quant_max, + const Tensor& indices, + exec_aten::optional out_dtype, + Tensor& out) { + ET_CHECK_MSG( + weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim()); + + ET_CHECK_MSG( + weight_scales.dim() == 1 || weight_scales.dim() == 2, + "weight_scales must be 1D or 2D but got() %zd dims", + weight_scales.dim()); + + ET_CHECK_MSG( + weight_scales.size(0) == weight.size(0), + "Number of scales must be == weight.size(0)=%zd" + ", but got %zd", + weight_scales.size(0), + weight.size(0)); + + if (weight_scales.dim() == 2) { + auto num_groups = weight_scales.size(1); + ET_CHECK_MSG( + // each 8b uint8 column is 2 columns + (2 * weight.size(1)) % num_groups == 0, + "Number of groups must divide weight.size(1)=%zd" + ", but got # of groups = %zd", + weight.size(1), + num_groups); + } + + ET_CHECK_MSG( + weight.scalar_type() == ScalarType::Byte, + "weight.scalar_type() %" PRId8 " is not supported:", + static_cast(weight.scalar_type())); + + ET_CHECK_MSG( + out.scalar_type() == ScalarType::Float || + out.scalar_type() == ScalarType::Half, + "out.scalar_type() %" PRId8 " is not supported:", + static_cast(out.scalar_type())); + + ET_CHECK_MSG( + weight_scales.scalar_type() == ScalarType::Float || + weight_scales.scalar_type() == ScalarType::Half, + "weight_scales.scalar_type() %" PRId8 " is not supported:", + static_cast(weight_scales.scalar_type())); + + if (opt_weight_zero_points.has_value()) { + ET_CHECK_MSG( + opt_weight_zero_points.value().dim() == weight_scales.dim(), + "weight_zero_points's rank match that of weight_scales. 
" + "weight_zero_points rank: %" PRId8 ", weight_scales rank: %" PRId8, + static_cast(opt_weight_zero_points.value().dim()), + static_cast(weight_scales.dim())); + + ET_CHECK_MSG( + opt_weight_zero_points.value().scalar_type() == out.scalar_type(), + "weight zero points scalar type %" PRId8 + " does not match out.scalar_type()", + static_cast(opt_weight_zero_points.value().scalar_type())); + + for (int32_t i = 0; i < weight_scales.dim(); ++i) { + ET_CHECK_MSG( + opt_weight_zero_points.value().size(i) == weight_scales.size(i), + "Dimension size misatch at dim %" PRId8 + "Weight_zero_point size = %zd" + ", weight_scales size = %zd.", + i, + opt_weight_zero_points.value().size(i), + weight_scales.size(i)); + } + } + + ET_CHECK_MSG( + indices.scalar_type() == ScalarType::Long, + "indices.scalar_type() %" PRId8 " is not Long only Long is supported:", + static_cast(indices.scalar_type())); + + ET_CHECK_MSG( + weight_quant_min <= weight_quant_max, + "weight quant min: %" PRId64 + " is greater than weight quant max: %" PRId64, + weight_quant_min, + weight_quant_max); + + if (out_dtype.has_value()) { + ET_CHECK_MSG( + out.scalar_type() == out_dtype.value(), + "output_dtype must match the dtype of the out tensor"); + } +} + +static inline int32_t weight_value(const unsigned char* w_data, int32_t index) { + int32_t odd = index & 1; + index >>= 1; + if (odd) { + return (int32_t)(w_data[index] & 0x0F) - 8; + } else { + return (int32_t)((w_data[index] >> 4) & 0x0F) - 8; + } +} + +/** + * Retrieves the embeddings specified by indices, dequantizes them, and stores + * them in out. Weight will always be uint8 + */ +template +void embedding_4bit_per_channel( + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const Tensor& indices, + Tensor& out) { + auto embedding_dim = weight.size(1) * 2; + + int32_t num_groups_per_channel = 1; + if (weight_scales.dim() == 2) { + num_groups_per_channel = weight_scales.size(1); + } + int32_t group_size = embedding_dim / num_groups_per_channel; + + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const int64_t* indices_ptr = indices.const_data_ptr(); + + const CTYPE_PARAMS* scales = weight_scales.const_data_ptr(); + const CTYPE_PARAMS* zero_points = nullptr; + if (opt_weight_zero_points.has_value()) { + zero_points = opt_weight_zero_points.value().const_data_ptr(); + } + + for (int i = 0; i < indices.numel(); i++) { + int64_t index = indices_ptr[i]; + // If using groupwise embedding + int32_t qparams_index = index * num_groups_per_channel; + CTYPE_PARAMS zp = 0.0; + const CTYPE_PARAMS* scale_ptr = scales + qparams_index; + const CTYPE_PARAMS* zero_points_ptr = nullptr; + if (opt_weight_zero_points.has_value()) { + zero_points_ptr = zero_points + qparams_index; + } + + const uint8_t* w_data = weight.data_ptr() + weight.size(1) * index; + + for (int j = 0; j < embedding_dim; ++j) { + int32_t group_id = j / group_size; + const CTYPE_PARAMS scale = scale_ptr[group_id]; + if (opt_weight_zero_points.has_value()) { + zp = zero_points_ptr[group_id]; + } + out_data[j] = static_cast( + (static_cast(weight_value(w_data, j)) - + static_cast(zp)) * + static_cast(scale)); + } + out_data += embedding_dim; + } +} + +void resize_out_tensor( + const Tensor& weight, + const Tensor& indices, + Tensor& out) { + exec_aten::SizesType expected_output_size[kTensorDimensionLimit]; + for (size_t i = 0; i < indices.dim(); i++) { + expected_output_size[i] = indices.size(i); + } + const size_t embedding_dim = weight.size(1) * 2; + 
expected_output_size[out.dim() - 1] = embedding_dim; + + exec_aten::ArrayRef output_size{ + expected_output_size, static_cast(out.dim())}; + + torch::executor::Error err = resize_tensor(out, output_size); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantized_embedding_4bit_out"); +} + +} // namespace + +/** + * Retrieves the embeddings specified by indices, dequantizes them, and stores + * them in out. The weight is quantized per channel, with a scale and zero_point + * for each embedding. + * + * Corresponds as the out variant to torch.ops.quantized.embedding_4bit + * + * NOTE: quant_min, quant_max, and Dtype are not used in computation, but rather + * metadata that is passed around which can be useful for pattern matching. See + * https://github.com/pytorch/pytorch/pull/87093#discussion_r1000841181 for more + * info. + */ +Tensor& quantized_embedding_4bit_out( + // TODO Evaluate whether this name is appropriate for an operator that takes + // non quant input and returns fp output + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const int64_t weight_quant_min, + const int64_t weight_quant_max, + const Tensor& indices, + Tensor& out) { + ScalarType out_type = out.scalar_type(); + + // TODO (jakeszwe): improve these to account for the size of out in relation + // to weight and indices accounting for a possible batch dimension + check_embedding_4bit_args( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out_type, + out); + + constexpr auto name = "quantized_decomposed::embedding_4bit.out"; + ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { + embedding_4bit_per_channel( + weight, weight_scales, opt_weight_zero_points, indices, out); + }); + + return out; +} + +Tensor& quantized_embedding_4bit_out( + RuntimeContext& context, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + int64_t weight_quant_min, + int64_t weight_quant_max, + const Tensor& indices, + Tensor& out) { + // TODO(larryliu): Add a context arg to the real op function and remove this + // wrapper + (void)context; + resize_out_tensor(weight, indices, out); + return quantized_embedding_4bit_out( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out); +} + +Tensor& quantized_embedding_4bit_dtype_out( + // TODO Evaluate whether this name is appropriate for an operator that takes + // non quant input and returns fp output + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const int64_t weight_quant_min, + const int64_t weight_quant_max, + const Tensor& indices, + exec_aten::optional out_dtype, + Tensor& out) { + // TODO (jakeszwe): improve these to account for the size of out in relation + // to weight and indices accounting for a possible batch dimension + check_embedding_4bit_args( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out_dtype, + out); + + ScalarType params_type = weight_scales.scalar_type(); + ScalarType out_type = out.scalar_type(); + + constexpr auto name = "quantized_decomposed::embedding_4bit.dtype_out"; + ET_SWITCH_TWO_TYPES(Float, Half, params_type, ctx, name, CTYPE_P, [&]() { + ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { + embedding_4bit_per_channel( + weight, weight_scales, opt_weight_zero_points, indices, 
out); + }); + }); + + return out; +} + +Tensor& quantized_embedding_4bit_dtype_out( + RuntimeContext& context, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + int64_t weight_quant_min, + int64_t weight_quant_max, + const Tensor& indices, + exec_aten::optional out_dtype, + Tensor& out) { + // TODO(larryliu): Add a context arg to the real op function and remove this + // wrapper + (void)context; + resize_out_tensor(weight, indices, out); + return quantized_embedding_4bit_dtype_out( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out_dtype, + out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl index 3a6f74631ac..39552aaaf10 100644 --- a/kernels/quantized/cpu/targets.bzl +++ b/kernels/quantized/cpu/targets.bzl @@ -23,6 +23,9 @@ _QUANT_OPS = ( op_target( name = "op_embedding", ), + op_target( + name = "op_embedding4b", + ), op_target( name = "op_mixed_mm", deps = [ diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index 484641318b4..ca2360b7d80 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -40,12 +40,24 @@ - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out -- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) +- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_dtype_out +- func: quantized_decomposed::embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_embedding_4bit_out + +- func: quantized_decomposed::embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_embedding_4bit_dtype_out + - func: quantized_decomposed::mixed_mm.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl index f907ed557ae..2c951fa4a7b 100644 --- a/kernels/quantized/targets.bzl +++ b/kernels/quantized/targets.bzl @@ -9,23 +9,54 @@ def define_common_targets(): ], ) + # Excluding embedding_byte ops because we choose to define them + # in python separately, mostly to be easy to share with oss. 
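Before moving on to the build-target changes, here is the dequantization rule that `embedding_4bit_per_channel` above applies: for column j of a row, the group index is j divided by the group size (embedding_dim / number of groups), and the output is (q - zero_point[group]) * scale[group]. A standalone reference sketch, illustrative only and using plain containers rather than ExecuTorch tensors:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Dequantizes one embedding row of already-unpacked int4 values using
// per-group scales and optional per-group zero points, mirroring the inner
// loop of embedding_4bit_per_channel.
inline std::vector<float> dequant_row_groupwise(
    const std::vector<int32_t>& q,            // unpacked int4 values, one row
    const std::vector<float>& scales,         // one scale per group
    const std::vector<float>& zero_points) {  // empty => zero point of 0
  const size_t group_size = q.size() / scales.size();
  std::vector<float> out(q.size());
  for (size_t j = 0; j < q.size(); ++j) {
    const size_t g = j / group_size;
    const float zp = zero_points.empty() ? 0.0f : zero_points[g];
    out[j] = (static_cast<float>(q[j]) - zp) * scales[g];
  }
  return out;
}
```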
et_operator_library( - name = "all_quantized_ops", - ops_schema_yaml_target = ":quantized.yaml", + name = "quantized_ops_need_aot_registration", + ops = [ + "quantized_decomposed::add.out", + "quantized_decomposed::choose_qparams.Tensor_out", + "quantized_decomposed::dequantize_per_channel.out", + "quantized_decomposed::dequantize_per_tensor.out", + "quantized_decomposed::dequantize_per_tensor.Tensor_out", + "quantized_decomposed::mixed_linear.out", + "quantized_decomposed::mixed_mm.out", + "quantized_decomposed::quantize_per_channel.out", + "quantized_decomposed::quantize_per_tensor.out", + "quantized_decomposed::quantize_per_tensor.Tensor_out", + ], define_static_targets = True, ) # lib used to register quantized ops into EXIR + exir_custom_ops_aot_lib( + name = "custom_ops_generated_lib", + yaml_target = ":quantized.yaml", + visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], + kernels = [":quantized_operators_aten"], + deps = [ + ":quantized_ops_need_aot_registration", + ], + ) + + # lib used to register quantized ops into EXIR + # TODO: merge this with custom_ops_generated_lib exir_custom_ops_aot_lib( name = "aot_lib", yaml_target = ":quantized.yaml", visibility = ["//executorch/..."], kernels = [":quantized_operators_aten"], deps = [ - ":all_quantized_ops", + ":quantized_ops_need_aot_registration", ], ) + et_operator_library( + name = "all_quantized_ops", + ops_schema_yaml_target = ":quantized.yaml", + define_static_targets = True, + ) + for aten_mode in (True, False): aten_suffix = "_aten" if aten_mode else "" @@ -49,6 +80,7 @@ def define_common_targets(): ], custom_ops_yaml_target = ":quantized.yaml", custom_ops_aten_kernel_deps = [":quantized_operators_aten"] if aten_mode else [], + custom_ops_requires_aot_registration = False, aten_mode = aten_mode, visibility = [ "//executorch/...", diff --git a/kernels/quantized/test/op_embedding4b_test.cpp b/kernels/quantized/test/op_embedding4b_test.cpp new file mode 100644 index 00000000000..1eb7aa11b2a --- /dev/null +++ b/kernels/quantized/test/op_embedding4b_test.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::RuntimeContext; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::native::quantized_embedding_4bit_out; + +using torch::executor::testing::TensorFactory; + +TEST(OpQuantizedEmbedding4bTest, TestGroupWiseQuantizedEmbedding) { + et_pal_init(); + TensorFactory tfb; + TensorFactory tf; + TensorFactory tfl; + + int64_t quant_min = -8; + int64_t quant_max = 7; + + Tensor weight_scales = tf.make({3}, {0.5, 1.0, 1.5}); + Tensor weight_zero_points = tf.make({3}, {1, -5, 0}); + + // -3, 1, 6, 7, + // 2, -5, -4, 0, + // -8, 3, -1, 6, + + Tensor qweight = tfb.make({3, 2}, {89, 239, 163, 72, 11, 126}); + + Tensor indices = tfl.make({3}, {0, 2, 1}); + + Tensor out = tf.zeros({3, 4}); + Tensor expected = tf.make( + {3, 4}, {-2.0, 0.0, 2.5, 3.0, -12.0, 4.5, -1.5, 9.0, 7.0, 0.0, 1.0, 5.0}); + + quantized_embedding_4bit_out( + qweight, + weight_scales, + weight_zero_points, + quant_min, + quant_max, + indices, + out); + + EXPECT_TENSOR_EQ(out, expected); + + out = tf.zeros({3, 4}); + auto context = RuntimeContext(); + torch::executor::native::quantized_embedding_4bit_out( + context, + qweight, + weight_scales, + weight_zero_points, + quant_min, + quant_max, + indices, + out); + + EXPECT_TENSOR_EQ(out, expected); + + // Groupwise quantization. groupsize = 2 + weight_scales = tf.make({3, 2}, {0.5, 1.0, 1.5, 2.0, 2.5, 3.0}); + weight_zero_points = tf.make({3, 2}, {1, -5, 0, 2, -3, -1}); + /* + fp_weight = [-2.0, 0.0, 11.0, 12.0, + 3.0, -7.5, -12.0, -4.0, + -12.5, 15.0, 0.0, 21.0] + */ + + out = tf.zeros({3, 4}); + expected = tf.make( + {3, 4}, + {-2.0, 0.0, 11.0, 12.0, -12.5, 15.0, 0.0, 21.0, 3.0, -7.5, -12.0, -4.0}); + + quantized_embedding_4bit_out( + qweight, + weight_scales, + weight_zero_points, + quant_min, + quant_max, + indices, + out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizedEmbedding4bTest, TestGroupWiseQuantizedEmbeddingDeath1) { + et_pal_init(); + TensorFactory tfb; + TensorFactory tf; + TensorFactory tfl; + + int64_t quant_min = -8; + int64_t quant_max = 7; + + Tensor weight_scales = tf.make({4}, {0.5, 1.0, 1.5, 3.3}); + Tensor weight_zero_points = tf.make({4}, {1, 5, 7, 5}); + Tensor qweight = tfb.make({3, 2}, {89, 239, 163, 72, 11, 126}); + Tensor indices = tfl.make({3}, {0, 2, 1}); + + Tensor out = tf.zeros({3, 4}); + ET_EXPECT_DEATH( + quantized_embedding_4bit_out( + qweight, + weight_scales, + weight_zero_points, + quant_min, + quant_max, + indices, + out), + ""); +} + +TEST(OpQuantizedEmbedding4bTest, TestGroupWiseQuantizedEmbeddingDeath2) { + et_pal_init(); + TensorFactory tfb; + TensorFactory tf; + TensorFactory tfl; + + int64_t quant_min = -8; + int64_t quant_max = 7; + + Tensor weight_scales = tf.make({2}, {0.5, 1.0}); + Tensor weight_zero_points = tf.make({2}, {1, 5}); + Tensor qweight = tfb.make({3, 2}, {89, 239, 163, 72, 11, 126}); + Tensor indices = tfl.make({3}, {0, 2, 1}); + + Tensor out = tf.zeros({3, 4}); + ET_EXPECT_DEATH( + quantized_embedding_4bit_out( + qweight, + weight_scales, + weight_zero_points, + quant_min, + quant_max, + indices, + out), + ""); +} diff --git a/kernels/quantized/test/targets.bzl b/kernels/quantized/test/targets.bzl index e06090cae91..a4129ee22fb 100644 --- a/kernels/quantized/test/targets.bzl +++ b/kernels/quantized/test/targets.bzl @@ -25,6 +25,7 @@ def define_common_targets(): 
"//executorch/kernels/portable/cpu:op_embedding", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ]) + op_test("op_embedding4b_test", kernel_name = "quantized") op_test("op_mixed_mm_test", kernel_name = "quantized", deps = [ "//executorch/kernels/quantized/cpu:op_mixed_mm", "//executorch/kernels/quantized:generated_lib_headers", diff --git a/kernels/test/op_split_with_sizes_copy_test.cpp b/kernels/test/op_split_with_sizes_copy_test.cpp index f67789c3561..91ef94af653 100644 --- a/kernels/test/op_split_with_sizes_copy_test.cpp +++ b/kernels/test/op_split_with_sizes_copy_test.cpp @@ -27,66 +27,88 @@ class OpSplitWithSizesCopyOutTest : public OperatorTest { return torch::executor::aten::split_with_sizes_copy_outf( context_, self, split_sizes, dim, out); } + + void test_tensor_shape_dynamism(exec_aten::TensorShapeDynamism dynamism) { + torch::executor::testing::TensorFactory + tfFloat; + + exec_aten::Tensor self = tfFloat.make( + {2, 6, 3}, + {-31.25, -92.75, -39.75, -3.25, 53.875, 88.25, -0.625, -1.125, + 14.75, 42.0, 89.875, -21.125, -8.0, -64.125, 23.0, 37.0, + 46.125, -83.25, -58.125, 19.625, -71.125, 64.75, -1.375, -83.5, + -61.375, 13.125, 28.625, -94.0, -67.0, -8.625, -88.875, -79.125, + 0.375, -61.375, 65.0, -99.375}); + ::std::vector split_sizes_vec = {3, 1, 2}; + exec_aten::ArrayRef split_sizes = exec_aten::ArrayRef( + split_sizes_vec.data(), split_sizes_vec.size()); + int64_t dim = 1; + + ::std::vector out_vec; + if (dynamism == exec_aten::TensorShapeDynamism::STATIC) { + out_vec = { + tfFloat.zeros({2, 3, 3}), + tfFloat.zeros({2, 1, 3}), + tfFloat.zeros({2, 2, 3})}; + } else { // dynamism == exec_aten::TensorShapeDynamism::DYNAMIC_BOUND + out_vec = { + tfFloat.zeros( + {2, 3, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND), + tfFloat.zeros( + {2, 1, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND), + tfFloat.zeros( + {2, 2, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND)}; + } + + exec_aten::TensorList out = + exec_aten::TensorList(out_vec.data(), out_vec.size()); + ::std::vector out_expected_vec = { + tfFloat.make( + {2, 3, 3}, + {-31.25, + -92.75, + -39.75, + -3.25, + 53.875, + 88.25, + -0.625, + -1.125, + 14.75, + -58.125, + 19.625, + -71.125, + 64.75, + -1.375, + -83.5, + -61.375, + 13.125, + 28.625}), + tfFloat.make({2, 1, 3}, {42.0, 89.875, -21.125, -94.0, -67.0, -8.625}), + tfFloat.make( + {2, 2, 3}, + {-8.0, + -64.125, + 23.0, + 37.0, + 46.125, + -83.25, + -88.875, + -79.125, + 0.375, + -61.375, + 65.0, + -99.375})}; + exec_aten::TensorList out_expected = + exec_aten::TensorList(out_expected_vec.data(), out_expected_vec.size()); + op_split_with_sizes_copy_out(self, split_sizes, dim, out); + EXPECT_TENSOR_LISTS_CLOSE(out, out_expected); + } }; TEST_F(OpSplitWithSizesCopyOutTest, SanityCheckDim1) { - torch::executor::testing::TensorFactory tfFloat; + test_tensor_shape_dynamism(exec_aten::TensorShapeDynamism::STATIC); +} - exec_aten::Tensor self = tfFloat.make( - {2, 6, 3}, - {-31.25, -92.75, -39.75, -3.25, 53.875, 88.25, -0.625, -1.125, - 14.75, 42.0, 89.875, -21.125, -8.0, -64.125, 23.0, 37.0, - 46.125, -83.25, -58.125, 19.625, -71.125, 64.75, -1.375, -83.5, - -61.375, 13.125, 28.625, -94.0, -67.0, -8.625, -88.875, -79.125, - 0.375, -61.375, 65.0, -99.375}); - ::std::vector split_sizes_vec = {3, 1, 2}; - exec_aten::ArrayRef split_sizes = exec_aten::ArrayRef( - split_sizes_vec.data(), split_sizes_vec.size()); - int64_t dim = 1; - ::std::vector out_vec = { - tfFloat.zeros({2, 3, 3}), - tfFloat.zeros({2, 1, 3}), - tfFloat.zeros({2, 2, 
3})}; - exec_aten::TensorList out = - exec_aten::TensorList(out_vec.data(), out_vec.size()); - ::std::vector out_expected_vec = { - tfFloat.make( - {2, 3, 3}, - {-31.25, - -92.75, - -39.75, - -3.25, - 53.875, - 88.25, - -0.625, - -1.125, - 14.75, - -58.125, - 19.625, - -71.125, - 64.75, - -1.375, - -83.5, - -61.375, - 13.125, - 28.625}), - tfFloat.make({2, 1, 3}, {42.0, 89.875, -21.125, -94.0, -67.0, -8.625}), - tfFloat.make( - {2, 2, 3}, - {-8.0, - -64.125, - 23.0, - 37.0, - 46.125, - -83.25, - -88.875, - -79.125, - 0.375, - -61.375, - 65.0, - -99.375})}; - exec_aten::TensorList out_expected = - exec_aten::TensorList(out_expected_vec.data(), out_expected_vec.size()); - op_split_with_sizes_copy_out(self, split_sizes, dim, out); - EXPECT_TENSOR_LISTS_CLOSE(out, out_expected); +TEST_F(OpSplitWithSizesCopyOutTest, DynamicShape) { + test_tensor_shape_dynamism(exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); } diff --git a/pyproject.toml b/pyproject.toml index ddd7bb0914c..fc597331fd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,20 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = [ + "cmake", # For building binary targets in the wheel. + "pyyaml", # Imported by the kernel codegen tools. + "setuptools", # For building the pip package contents. + "tomli", # Imported by extract_sources.py when using python < 3.11. + "wheel", # For building the pip package archive. + "zstd", # Imported by resolve_buck.py. +] build-backend = "setuptools.build_meta" [project] name = "executorch" -version = "0.1.0" +# TODO(dbort): Use setuptools-git-versioning or setuptools-scm to get the +# version from the git branch state. For now, use a version that doesn't look +# like a real release. +version = "0.2.1.dev0+unknown" # Python dependencies required for development dependencies=[ "expecttest", diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index c3d550fc07d..da10dd53cab 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -10,7 +10,7 @@ flake8-comprehensions==3.12.0 flake8-pyi==23.5.0 mccabe==0.7.0 pycodestyle==2.10.0 -torchfix==0.1.1 +torchfix==0.5.0 # UFMT black==24.2.0 @@ -18,5 +18,5 @@ ufmt==2.5.1 usort==1.0.5 # Other linters -clang-format==12.0.1 +clang-format==18.1.3 cmakelint==1.4.1 diff --git a/runtime/core/memory_allocator.h b/runtime/core/memory_allocator.h index ec9315a5c2b..8613b9ac647 100644 --- a/runtime/core/memory_allocator.h +++ b/runtime/core/memory_allocator.h @@ -63,7 +63,7 @@ class MemoryAllocator { /** * Allocates `size` bytes of memory. * - * @param[in] size Number of memory chunks to allocate. + * @param[in] size Number of bytes to allocate. * @param[in] alignment Minimum alignment for the returned pointer. Must be a * power of 2. 
* diff --git a/runtime/core/portable_type/optional.h b/runtime/core/portable_type/optional.h index 94c0caa8a22..3d8cb41eac8 100644 --- a/runtime/core/portable_type/optional.h +++ b/runtime/core/portable_type/optional.h @@ -74,8 +74,8 @@ class optional final { } optional& operator=(optional&& rhs) noexcept( - std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value) { + std::is_nothrow_move_assignable::value && + std::is_nothrow_move_constructible::value) { if (init_ && !rhs.init_) { clear(); } else if (!init_ && rhs.init_) { diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index d4d79b121ce..fc03c36c57d 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -49,6 +49,7 @@ def define_common_targets(): "bits_types.h", ], visibility = [ + "//executorch/extension/...", "//executorch/runtime/core/exec_aten/util/...", "//executorch/kernels/...", ], diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index fe9edd5c8a2..4bbe58f322b 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -9,6 +9,15 @@ def get_event_tracer_flags(): event_tracer_flags += ["-DET_EVENT_TRACER_ENABLED"] return event_tracer_flags +def build_sdk(): + return native.read_config("executorch", "build_sdk", "false") == "true" + +def get_sdk_flags(): + sdk_flags = [] + if build_sdk(): + sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + return sdk_flags + def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -92,7 +101,7 @@ def define_common_targets(): "//executorch/...", "@EXECUTORCH_CLIENTS", ], - exported_preprocessor_flags = get_event_tracer_flags(), + exported_preprocessor_flags = get_event_tracer_flags() + get_sdk_flags(), exported_deps = [ "//executorch/runtime/platform:platform", "//executorch/runtime/core:evalue" + aten_suffix, diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 0a3a64a13c4..378adf15288 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -786,7 +786,7 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) { input_idx, inputs_size()); - const auto& e = get_input(input_idx); + const auto& e = get_value(get_input_index(input_idx)); ET_CHECK_OR_RETURN_ERROR( e.isTensor() || e.isScalar(), InvalidArgument, @@ -946,7 +946,7 @@ Method::set_output_data_ptr(void* buffer, size_t size, size_t output_idx) { output_idx, outputs_size()); - auto& output = mutable_output(output_idx); + auto& output = mutable_value(get_output_index(output_idx)); ET_CHECK_OR_RETURN_ERROR( output.isTensor(), InvalidArgument, @@ -1013,11 +1013,14 @@ Error Method::execute_instruction() { EXECUTORCH_SCOPE_PROF("OPERATOR_CALL"); internal::EventTracerProfileScope event_tracer_scope = internal::EventTracerProfileScope(event_tracer_, "OPERATOR_CALL"); - // TODO(T147221312): Also expose the temp allocator and tensor resizer - // via the context. - KernelRuntimeContext context(event_tracer_); + // TODO(T147221312): Also expose tensor resizer via the context. 
+ // The temp_allocator passed can be null, but calling allocate_temp will + // fail + KernelRuntimeContext context( + event_tracer_, memory_manager_->temp_allocator()); auto args = chain.argument_lists_[step_state_.instr_idx]; chain.kernels_[step_state_.instr_idx](context, args.data()); + // We reset the temp_allocator after the switch statement err = context.failure_state(); if (err != Error::Ok) { // We know that instr_args_as_KernelCall is non-null because it was diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index b3db52720c7..46f997a80ad 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -44,9 +44,20 @@ def define_common_targets(): for aten_mode in (True, False): aten_suffix = "_aten" if aten_mode else "" - runtime.cxx_library( name = "program" + aten_suffix, + exported_deps = [ + ":program_no_prim_ops" + aten_suffix, + "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "program_no_prim_ops" + aten_suffix, srcs = [ "method.cpp", "method_meta.cpp", @@ -54,34 +65,28 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], - headers = [ - "tensor_parser.h", - ], exported_headers = [ "method.h", "method_meta.h", "program.h", + "tensor_parser.h", ], - deps = [ - "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + preprocessor_flags = _program_preprocessor_flags(), + exported_deps = [ + ":memory_manager", "//executorch/runtime/backend:interface", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue" + aten_suffix, + "//executorch/runtime/core:event_tracer" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/platform:platform", "//executorch/schema:extended_header", - "//executorch/schema:program", - ":memory_manager", ], - preprocessor_flags = _program_preprocessor_flags(), - exported_deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue" + aten_suffix, - "//executorch/runtime/platform:platform", - "//executorch/runtime/core:event_tracer" + aten_suffix, - ":memory_manager", + deps = [ + "//executorch/schema:program", ], visibility = [ "//executorch/runtime/executor/...", diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h index 8b51ca5ed08..7317315b529 100644 --- a/runtime/kernel/kernel_runtime_context.h +++ b/runtime/kernel/kernel_runtime_context.h @@ -10,6 +10,8 @@ #include #include +#include +#include #include namespace torch { @@ -24,10 +26,21 @@ namespace executor { class KernelRuntimeContext { public: /** - * Construct a new kernel runtime context along with an optional event tracer. + * Construct a new kernel runtime context. + * + * KernelRuntimeContext does not take ownership + * of these pointers, so they must outlive the context instance. + * + * @param[in] event_tracer The optional EventTracer to use for + * profiling/debugging + * @param[in] temp_allocator The optional MemoryAllocator used to allocate + * temporary memory for the kernel. 
If not provided, an error will be + * returned when calling allocate_temp. */ - KernelRuntimeContext(EventTracer* event_tracer = nullptr) - : event_tracer_(event_tracer) {} + KernelRuntimeContext( + EventTracer* event_tracer = nullptr, + MemoryAllocator* temp_allocator = nullptr) + : event_tracer_(event_tracer), temp_allocator_(temp_allocator) {} /** * Tells the runtime that the kernel call has failed. Prefer this over * ET_CHECK_*(), which fatally panics the process/system. @@ -60,12 +73,37 @@ class KernelRuntimeContext { return event_tracer_; } - // TODO(T147221312): Add a way to allocate temporary memory. + /** + * Allocates temporary memory that will be freed when the kernel returns. This + * returns a pointer to the allocated memory or an error if the allocation + * fails. + * + * @param[in] size Number of bytes to allocate. + * @param[in] alignment Minimum alignment for the returned pointer. Must be a + * power of 2. + * + * @returns A result object containing either a pointer to the allocated + * memory or an error to indicate failure + */ + Result allocate_temp( + size_t size, + size_t alignment = MemoryAllocator::kDefaultAlignment) { + ET_CHECK_OR_RETURN_ERROR( + temp_allocator_ != nullptr, NotFound, "No temp allocator provided"); + void* temp_memory = temp_allocator_->allocate(size, alignment); + ET_CHECK_OR_RETURN_ERROR( + temp_memory != nullptr, + MemoryAllocationFailed, + "Failed to allocate temp memory. Bytes requested: %zu", + size); + return temp_memory; + } // TODO(T147221312): Add a way to resize a tensor. private: EventTracer* event_tracer_ = nullptr; + MemoryAllocator* temp_allocator_ = nullptr; Error failure_state_ = Error::Ok; }; diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index a8f9eb50525..0bf45321dc9 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -55,6 +55,7 @@ def define_common_targets(): exported_deps = [ "//executorch/runtime/core:core", "//executorch/runtime/platform:platform", + "//executorch/runtime/core:memory_allocator", "//executorch/runtime/core:event_tracer" + aten_suffix, # TODO(T147221312): This will eventually depend on exec_aten # once KernelRuntimeContext support tensor resizing, which is diff --git a/runtime/kernel/test/kernel_runtime_context_test.cpp b/runtime/kernel/test/kernel_runtime_context_test.cpp index 7147dc2a169..15709d52bff 100644 --- a/runtime/kernel/test/kernel_runtime_context_test.cpp +++ b/runtime/kernel/test/kernel_runtime_context_test.cpp @@ -15,6 +15,8 @@ using namespace ::testing; using torch::executor::Error; using torch::executor::KernelRuntimeContext; +using torch::executor::MemoryAllocator; +using torch::executor::Result; class KernelRuntimeContextTest : public ::testing::Test { public: @@ -23,6 +25,17 @@ class KernelRuntimeContextTest : public ::testing::Test { } }; +class TestMemoryAllocator : public MemoryAllocator { + public: + TestMemoryAllocator(uint32_t size, uint8_t* base_address) + : MemoryAllocator(size, base_address), last_seen_alignment(0) {} + void* allocate(size_t size, size_t alignment) override { + last_seen_alignment = alignment; + return MemoryAllocator::allocate(size, alignment); + } + size_t last_seen_alignment; +}; + TEST_F(KernelRuntimeContextTest, FailureStateDefaultsToOk) { KernelRuntimeContext context; @@ -47,3 +60,43 @@ TEST_F(KernelRuntimeContextTest, FailureStateReflectsFailure) { context.fail(Error::Ok); EXPECT_EQ(context.failure_state(), Error::Ok); } + +TEST_F(KernelRuntimeContextTest, FailureNoMemoryAllocatorProvided) { + 
KernelRuntimeContext context; + Result allocated_memory = context.allocate_temp(4); + EXPECT_EQ(allocated_memory.error(), Error::NotFound); +} + +TEST_F(KernelRuntimeContextTest, SuccessfulMemoryAllocation) { + constexpr size_t temp_memory_allocator_pool_size = 4; + auto temp_memory_allocator_pool = + std::make_unique(temp_memory_allocator_pool_size); + MemoryAllocator temp_allocator( + temp_memory_allocator_pool_size, temp_memory_allocator_pool.get()); + KernelRuntimeContext context(nullptr, &temp_allocator); + Result allocated_memory = context.allocate_temp(4); + EXPECT_EQ(allocated_memory.ok(), true); +} + +TEST_F(KernelRuntimeContextTest, FailureMemoryAllocationInsufficientSpace) { + constexpr size_t temp_memory_allocator_pool_size = 4; + auto temp_memory_allocator_pool = + std::make_unique(temp_memory_allocator_pool_size); + MemoryAllocator temp_allocator( + temp_memory_allocator_pool_size, temp_memory_allocator_pool.get()); + KernelRuntimeContext context(nullptr, &temp_allocator); + Result allocated_memory = context.allocate_temp(8); + EXPECT_EQ(allocated_memory.error(), Error::MemoryAllocationFailed); +} + +TEST_F(KernelRuntimeContextTest, MemoryAllocatorAlignmentPassed) { + constexpr size_t temp_memory_allocator_pool_size = 4; + auto temp_memory_allocator_pool = + std::make_unique(temp_memory_allocator_pool_size); + TestMemoryAllocator temp_allocator( + temp_memory_allocator_pool_size, temp_memory_allocator_pool.get()); + KernelRuntimeContext context(nullptr, &temp_allocator); + Result allocated_memory = context.allocate_temp(4, 2); + EXPECT_EQ(allocated_memory.ok(), true); + EXPECT_EQ(temp_allocator.last_seen_alignment, 2); +} diff --git a/runtime/platform/assert.h b/runtime/platform/assert.h index 8728e6a8d74..9222b8ed7fa 100644 --- a/runtime/platform/assert.h +++ b/runtime/platform/assert.h @@ -35,7 +35,7 @@ */ #define ET_CHECK_MSG(_cond, _format, ...) \ ({ \ - if (!(_cond)) { \ + if __ET_UNLIKELY (!(_cond)) { \ ET_ASSERT_MESSAGE_EMIT(" (%s): " _format, #_cond, ##__VA_ARGS__); \ torch::executor::runtime_abort(); \ } \ @@ -49,7 +49,7 @@ */ #define ET_CHECK(_cond) \ ({ \ - if (!(_cond)) { \ + if __ET_UNLIKELY (!(_cond)) { \ ET_ASSERT_MESSAGE_EMIT(": %s", #_cond); \ torch::executor::runtime_abort(); \ } \ diff --git a/schema/program.fbs b/schema/program.fbs index c17836cda7f..df585ec03fd 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -324,7 +324,7 @@ table ExecutionPlan { delegates: [BackendDelegate]; // List of buffer sizes for non_constant memory allocations. (Think neural net activations) - // A list instead of a single buffer to account for complex memory heirarchies. + // A list instead of a single buffer to account for complex memory hierarchies. // TODO(jakeszwe, razy): How to reconcile this with the ability for the hierarchical memory allocator // to be id based instead of index based. 
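Returning to the `KernelRuntimeContext::allocate_temp()` API added above: a kernel asks the context for scratch memory and handles the two documented failure modes, `NotFound` when no temp allocator was supplied and `MemoryAllocationFailed` when the pool cannot satisfy the request. A hedged sketch of a caller follows; the function name and how the buffer is used are invented for illustration, and only `allocate_temp`, `Result`, and `Error` come from the headers touched in this diff.

```cpp
#include <cstddef>

#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace torch {
namespace executor {

// Illustrative only: a kernel-style helper that requests temporary scratch
// space from the runtime context.
Error use_scratch_buffer(KernelRuntimeContext& ctx, size_t nbytes) {
  Result<void*> scratch = ctx.allocate_temp(nbytes);
  if (!scratch.ok()) {
    // Error::NotFound if no temp allocator was provided to the context;
    // Error::MemoryAllocationFailed if the pool could not satisfy `nbytes`.
    return scratch.error();
  }
  void* buf = scratch.get();  // freed by the temp allocator after the kernel returns
  (void)buf;                  // ... do intermediate work here ...
  return Error::Ok;
}

} // namespace executor
} // namespace torch
```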
// Runtime should use the len(constant_buffer) as the ground truth of the diff --git a/sdk/CMakeLists.txt b/sdk/CMakeLists.txt index 96e2526f8de..73b30008c0b 100644 --- a/sdk/CMakeLists.txt +++ b/sdk/CMakeLists.txt @@ -69,18 +69,33 @@ include(ExternalProject) set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/include") set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/bundled_program") -# Add the host project -# lint_cmake: -readability/wonkycase -ExternalProject_Add( - flatcc_project - PREFIX ${CMAKE_BINARY_DIR}/_host_build - SOURCE_DIR ${_flatcc_source_dir} - BINARY_DIR ${CMAKE_BINARY_DIR}/_host_build - CMAKE_CACHE_ARGS -DFLATCC_TEST:BOOL=OFF -DFLATCC_REFLECTION:BOOL=OFF +# TODO(dbort): Only enable this when cross-compiling. It can cause build race +# conditions (libflatcc.a errors) when enabled. +option(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT + "Whether to build the flatcc commandline tool as a separate project" ON) + +if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) + # Add the host project. We build this separately so that we can generate + # headers on the host during the build, even if we're cross-compiling the + # flatcc runtime to a different architecture. + + # lint_cmake: -readability/wonkycase + ExternalProject_Add( + flatcc_project + PREFIX ${CMAKE_BINARY_DIR}/_host_build + SOURCE_DIR ${_flatcc_source_dir} + BINARY_DIR ${CMAKE_BINARY_DIR}/_host_build + CMAKE_CACHE_ARGS + -DFLATCC_TEST:BOOL=OFF -DFLATCC_REFLECTION:BOOL=OFF # See above comment about POSITION_INDEPENDENT_CODE. -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - INSTALL_COMMAND "" # Prevent the install step, modify as needed -) + INSTALL_COMMAND "" # Prevent the install step + ) + set(_etdump_schema_gen_dep flatcc_project) +else() + # If we're not cross-compiling, we can just use the plain commandline target. + set(_etdump_schema_gen_dep flatcc_cli) +endif() set(_etdump_schema__outputs) foreach(fbs_file ${_etdump_schema_names}) @@ -114,33 +129,45 @@ file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump) file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/bundled_program) -add_custom_command( - OUTPUT ${_etdump_schema__outputs} - COMMAND - # Note that the flatcc project actually writes its outputs into the source - # tree instead of under the binary directory, and there's no way to change - # that behavior. - ${_flatcc_source_dir}/bin/flatcc -cwr -o - ${_program_schema__include_dir}/executorch/sdk/etdump - ${_etdump_schema__srcs} +if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) + # If we cross-compiling, we need to use the version of the commandline tool + # built for the host. + set(_etdump_schema_gen_dep flatcc_project) + # TODO(dbort): flatcc installs its files directly in its source directory - # instead of under CMAKE_BINARY_DIR, and it has no options to avoid - # doing this. We build flatcc twice in the executorch build: once to get - # the `flatcc` host commandline tool, and once to get the (potentially + # instead of under CMAKE_BINARY_DIR, and it has no options to avoid doing + # this. We build flatcc twice in the executorch build: once to get the + # `flatcc` host commandline tool, and once to get the (potentially # cross-compiled) target runtime library. The host build will put its outputs - # in the source tree, making the cross-compiling target build think that - # the outputs have already been built. It will then try to link against the + # in the source tree, making the cross-compiling target build think that the + # outputs have already been built. 
It will then try to link against the # host-architecture libraries, failing when cross-compiling. To work around # this, delete the host outputs after running this command (which only runs # when setting up the cmake files, not when actually building). This leaves # room for the target build to put its own files in the source tree. We should # try to remove this hack, ideally by submitting an upstream PR that adds an # option to change the installation location. + set(_etdump_schema_cleanup_paths ${_flatcc_source_dir}/bin/* + ${_flatcc_source_dir}/lib/*) +else() + # If we're not cross-compiling we can use the plain commandline target, and we + # don't need to delete any files. + set(_etdump_schema_gen_dep flatcc_cli) + set(_etdump_schema_cleanup_paths "") +endif() + +add_custom_command( + OUTPUT ${_etdump_schema__outputs} COMMAND - rm -f ${_flatcc_source_dir}/bin/* - ${_flatcc_source_dir}/lib/* + # Note that the flatcc project actually writes its outputs into the source + # tree instead of under the binary directory, and there's no way to change + # that behavior. + ${_flatcc_source_dir}/bin/flatcc -cwr -o + ${_program_schema__include_dir}/executorch/sdk/etdump + ${_etdump_schema__srcs} + COMMAND rm -f ${_etdump_schema_cleanup_paths} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk - DEPENDS flatcc_project + DEPENDS ${_etdump_schema_gen_dep} COMMENT "Generating etdump headers" VERBATIM) diff --git a/sdk/bundled_program/config.py b/sdk/bundled_program/config.py index d1ca4c10e36..3bfbe7bc69c 100644 --- a/sdk/bundled_program/config.py +++ b/sdk/bundled_program/config.py @@ -62,7 +62,7 @@ def __init__( input: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input - other than torch.tensor type, only the input in torch.tensor type will be actually updated in + other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. diff --git a/sdk/debug_format/et_schema.py b/sdk/debug_format/et_schema.py index af95bc7f03a..9a6af4edba9 100644 --- a/sdk/debug_format/et_schema.py +++ b/sdk/debug_format/et_schema.py @@ -260,7 +260,12 @@ def gen_operator_graph( assert len(args) == 1 # Args of op=='output' is a wrapped list of return nodes ([ret_1, ret_2, ...], ) in_nodes = [ - nodes[FXOperatorGraph._get_node_name(ret)] for ret in args[0] + ( + nodes[FXOperatorGraph._get_node_name(ret)] + if ret is not None + else [] + ) + for ret in args[0] ] node = ValueNode( name, diff --git a/sdk/etdump/etdump_flatcc.cpp b/sdk/etdump/etdump_flatcc.cpp index 4e67532ca81..da4851214fd 100644 --- a/sdk/etdump/etdump_flatcc.cpp +++ b/sdk/etdump/etdump_flatcc.cpp @@ -103,7 +103,8 @@ ETDumpGen::ETDumpGen(Span buffer) { alloc.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, - (size_t)((buffer_size / 4 > max_alloc_buf_size) ? max_alloc_buf_size : buffer_size / 4)); + (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size + : buffer_size / 4)); et_flatcc_custom_init(builder, &alloc); } else { builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); diff --git a/sdk/inspector/_inspector.py b/sdk/inspector/_inspector.py index 91492643c89..45fe272cbb2 100644 --- a/sdk/inspector/_inspector.py +++ b/sdk/inspector/_inspector.py @@ -312,6 +312,9 @@ class Event: _instruction_id: Optional[int] = None _delegate_metadata_parser: Optional[Callable[[List[str]], Dict[str, Any]]] = None + _delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None @cached_property def delegate_debug_metadatas(self) -> Union[List[str], Dict[str, Any]]: @@ -391,6 +394,9 @@ def _gen_from_inference_events( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, ) -> "Event": """ Given an EventSignature and a list of Events with that signature, @@ -411,6 +417,7 @@ def _gen_from_inference_events( name="", _instruction_id=signature.instruction_id, _delegate_metadata_parser=delegate_metadata_parser, + _delegate_time_scale_converter=delegate_time_scale_converter, ) # Populate fields from profile events @@ -476,14 +483,35 @@ def _populate_profiling_related_fields( f"Expected exactly one profile event per InstructionEvent when generating Inspector Event, but got {len(profile_events)}" ) + profile_event = profile_events[0] + # Scale factor should only be applied to non-delegated ops - scale_factor_updated = 1 if ret_event.is_delegated_op else scale_factor + if ( + ret_event.is_delegated_op + and ret_event._delegate_time_scale_converter is not None + ): + scaled_time = ret_event._delegate_time_scale_converter( + ret_event.name, + profile_event.end_time, + # pyre-ignore + ) - ret_event._delegate_time_scale_converter( + ret_event.name, profile_event.start_time + ) + # If it's not a delegated op then we can just use the raw time values + # and then scale them according to the scale factor that was passed in. + elif not ret_event.is_delegated_op: + scaled_time = ( + float(profile_event.end_time - profile_event.start_time) + / scale_factor + ) + # If there was no scale factor passed in just take a difference of the + # end and start times. 
+ else: + scaled_time = float( + profile_event.end_time - profile_event.start_time + ) - profile_event = profile_events[0] - data.append( - float(profile_event.end_time - profile_event.start_time) - / scale_factor_updated - ) + data.append(scaled_time) delegate_debug_metadatas.append( profile_event.delegate_debug_metadata if profile_event.delegate_debug_metadata @@ -646,6 +674,9 @@ def _gen_from_etdump( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, ) -> List["EventBlock"]: """ Given an etdump, generate a list of EventBlocks corresponding to the @@ -743,6 +774,7 @@ class GroupedRunInstances: scale_factor, output_buffer, delegate_metadata_parser, + delegate_time_scale_converter, ) for signature, instruction_events in run_group.items() ] @@ -875,6 +907,9 @@ def __init__( delegate_metadata_parser: Optional[ Callable[[List[str]], Dict[str, Any]] ] = None, + delegate_time_scale_converter: Optional[ + Callable[[Union[int, str], Union[int, float]], Union[int, float]] + ] = None, enable_module_hierarchy: bool = False, ) -> None: r""" @@ -930,6 +965,7 @@ def __init__( self._target_time_scale, output_buffer, delegate_metadata_parser=delegate_metadata_parser, + delegate_time_scale_converter=delegate_time_scale_converter, ) # Connect ETRecord to EventBlocks diff --git a/sdk/inspector/_inspector_utils.py b/sdk/inspector/_inspector_utils.py index a71d34753b3..ecef1d13e73 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/sdk/inspector/_inspector_utils.py @@ -103,6 +103,9 @@ def get_scalar_type_size(scalar_type: ScalarType) -> Tuple[torch.dtype, int]: return torch.zeros(tensor.sizes, dtype=torch_dtype) tensor_bytes_size = math.prod(tensor.sizes) * dtype_size + if tensor_bytes_size == 0: + # Empty tensor. Return empty tensor. 
+ return torch.zeros(tensor.sizes, dtype=torch_dtype) if tensor.offset is None: raise ValueError("Tensor offset cannot be None") diff --git a/sdk/inspector/tests/TARGETS b/sdk/inspector/tests/TARGETS index 0e6d06e776c..374d2ea7538 100644 --- a/sdk/inspector/tests/TARGETS +++ b/sdk/inspector/tests/TARGETS @@ -9,6 +9,7 @@ python_unittest( "//executorch/exir:lib", "//executorch/sdk:lib", "//executorch/sdk/debug_format:et_schema", + "//executorch/sdk/etdump:schema_flatcc", "//executorch/sdk/etrecord/tests:etrecord_test_library", "//executorch/sdk/inspector:inspector", "//executorch/sdk/inspector:lib", diff --git a/sdk/inspector/tests/inspector_test.py b/sdk/inspector/tests/inspector_test.py index 472f56f767d..e1625bec755 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/sdk/inspector/tests/inspector_test.py @@ -17,9 +17,15 @@ from executorch.exir import ExportedProgram from executorch.sdk import generate_etrecord, parse_etrecord from executorch.sdk.debug_format.et_schema import OperatorNode +from executorch.sdk.etdump.schema_flatcc import ProfileEvent from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData +from executorch.sdk.inspector._inspector import ( + InstructionEvent, + InstructionEventSignature, + ProfileEventSignature, +) OP_TYPE = "aten::add" @@ -183,6 +189,49 @@ def test_inspector_associate_with_op_graph_nodes_multiple_debug_handles(self): expected_ops = ["op_0", "op_1"] self.assertEqual(event_with_multiple_debug_handles.op_types, expected_ops) + def test_inspector_delegate_time_scale_converter(self): + def time_scale_converter(event_name, time): + return time / 10 + + event = Event( + name="", + _delegate_metadata_parser=None, + _delegate_time_scale_converter=None, + ) + event_signature = ProfileEventSignature( + name="", + instruction_id=0, + delegate_id_str="test_event", + ) + instruction_events = [ + InstructionEvent( + signature=InstructionEventSignature(0, 0), + profile_events=[ + ProfileEvent( + name="test_event", + chain_index=0, + instruction_id=0, + delegate_debug_id_int=None, + delegate_debug_id_str="test_event_delegated", + start_time=100, + end_time=200, + delegate_debug_metadata=None, + ) + ], + ) + ] + Event._populate_profiling_related_fields( + event, event_signature, instruction_events, 1 + ) + # Value of the perf data before scaling is done. + self.assertEqual(event.perf_data.raw[0], 100) + event._delegate_time_scale_converter = time_scale_converter + Event._populate_profiling_related_fields( + event, event_signature, instruction_events, 1 + ) + # Value of the perf data after scaling is done. 200/10 - 100/10. 
+ self.assertEqual(event.perf_data.raw[0], 10) + def test_inspector_get_exported_program(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( diff --git a/sdk/targets.bzl b/sdk/targets.bzl new file mode 100644 index 00000000000..38c2e6e820e --- /dev/null +++ b/sdk/targets.bzl @@ -0,0 +1,8 @@ +def build_sdk(): + return native.read_config("executorch", "build_sdk", "false") == "true" + +def get_sdk_flags(): + sdk_flags = [] + if build_sdk(): + sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + return sdk_flags diff --git a/setup.py b/setup.py index bef57764b9d..6eac32bcb30 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,11 @@ def pybindings(cls) -> bool: def xnnpack(cls) -> bool: return cls._is_env_enabled("EXECUTORCH_BUILD_XNNPACK", default=False) + @classmethod + @property + def llama_custom_ops(cls) -> bool: + return cls._is_env_enabled("EXECUTORCH_BUILD_CUSTOM_OPS_AOT", default=True) + class _BaseExtension(Extension): """A base class that maps an abstract source to an abstract destination.""" @@ -359,6 +364,12 @@ def run(self): # useful error information to users. "-DEXECUTORCH_ENABLE_LOGGING=ON", "-DEXECUTORCH_LOG_LEVEL=Info", + "-DCMAKE_OSX_DEPLOYMENT_TARGET=10.15", + # The separate host project is only required when cross-compiling, + # and it can cause build race conditions (libflatcc.a errors) when + # enabled. TODO(dbort): Remove this override once this option is + # managed by cmake itself. + "-DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF", ] build_args = [f"-j{self.parallel}"] @@ -370,6 +381,7 @@ def run(self): if ShouldBuild.pybindings: cmake_args += [ "-DEXECUTORCH_BUILD_PYBIND=ON", + "-DEXECUTORCH_BUILD_QUANTIZED=ON", # add quantized ops to pybindings. ] build_args += ["--target", "portable_lib"] if ShouldBuild.xnnpack: @@ -380,12 +392,26 @@ def run(self): # into the portable_lib target. # TODO(dbort): Add MPS/CoreML backends when building on macos. + if ShouldBuild.llama_custom_ops: + cmake_args += [ + "-DEXECUTORCH_BUILD_CUSTOM=ON", # add llama sdpa ops to pybindings. + "-DEXECUTORCH_BUILD_CUSTOM_OPS_AOT=ON", + ] + build_args += ["--target", "custom_ops_aot_lib"] # Allow adding extra cmake args through the environment. Used by some # tests and demos to expand the set of targets included in the pip # package. if "CMAKE_ARGS" in os.environ: cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + # Allow adding extra build args through the environment. Used by some + # tests and demos to expand the set of targets included in the pip + # package. + if "CMAKE_BUILD_ARGS" in os.environ: + build_args += [ + item for item in os.environ["CMAKE_BUILD_ARGS"].split(" ") if item + ] + # Put the cmake cache under the temp directory, like # "pip-out/temp./cmake-out". cmake_cache_dir = os.path.join(repo_root, self.build_temp, "cmake-out") @@ -434,7 +460,15 @@ def get_ext_modules() -> list[Extension]: # portable kernels, and a selection of backends. This lets users # load and execute .pte files from python. BuiltExtension( - "portable_lib.*", "executorch.extension.pybindings.portable_lib" + "_portable_lib.*", "executorch.extension.pybindings._portable_lib" + ) + ) + if ShouldBuild.llama_custom_ops: + ext_modules.append( + # Install the prebuilt library for custom ops used in llama. 
+ BuiltFile( + "examples/models/llama2/custom_ops/libcustom_ops_aot_lib.*", + "executorch/examples/models/llama2/custom_ops", ) ) diff --git a/shim/shims.bzl b/shim/shims.bzl index b332c566af2..4bc98f9fef9 100644 --- a/shim/shims.bzl +++ b/shim/shims.bzl @@ -5,7 +5,10 @@ # License, Version 2.0 found in the LICENSE-APACHE file in the root directory # of this source tree. -# @lint-ignore FBCODEBZLADDLOADS +# @lint-ignore-every FBCODEBZLADDLOADS + +prelude = native + _SELECT_TYPE = type(select({"DEFAULT": []})) def is_select(thing): @@ -17,10 +20,9 @@ def cpp_library( undefined_symbols = None, visibility = ["PUBLIC"], **kwargs): - _unused = (undefined_symbols) # @unused + _unused = undefined_symbols # @unused - # @lint-ignore BUCKLINT: avoid "native is forbidden in fbcode" - native.cxx_library( + prelude.cxx_library( deps = _maybe_select_map(deps + external_deps_to_targets(external_deps), _fix_deps), visibility = visibility, preferred_linkage = "static", @@ -49,9 +51,8 @@ def rust_library( # Reset visibility because internal and external paths are different. visibility = ["PUBLIC"] - # @lint-ignore BUCKLINT: avoid "Direct usage of native rules is not allowed." - native.rust_library( - rustc_flags = rustc_flags + [_CFG_BUCK_OSS_BUILD], + prelude.rust_library( + rustc_flags = rustc_flags + [_CFG_BUCK_BUILD], deps = deps, visibility = visibility, mapped_srcs = mapped_srcs, @@ -71,8 +72,8 @@ def rust_binary( deps = _maybe_select_map(deps, _fix_deps) # @lint-ignore BUCKLINT: avoid "Direct usage of native rules is not allowed." - native.rust_binary( - rustc_flags = rustc_flags + [_CFG_BUCK_OSS_BUILD], + prelude.rust_binary( + rustc_flags = rustc_flags + [_CFG_BUCK_BUILD], deps = deps, visibility = visibility, **kwargs @@ -85,9 +86,8 @@ def rust_unittest( **kwargs): deps = _maybe_select_map(deps, _fix_deps) - # @lint-ignore BUCKLINT: avoid "Direct usage of native rules is not allowed." - native.rust_test( - rustc_flags = rustc_flags + [_CFG_BUCK_OSS_BUILD], + prelude.rust_test( + rustc_flags = rustc_flags + [_CFG_BUCK_BUILD], deps = deps, visibility = visibility, **kwargs @@ -129,8 +129,7 @@ def rust_protobuf_library( }, ) - # @lint-ignore BUCKLINT: avoid "Direct usage of native rules is not allowed." - native.genrule( + prelude.genrule( name = proto_name, srcs = protos + [ "buck//third-party/proto:google_protobuf", @@ -157,8 +156,7 @@ def rust_protobuf_library( # For python tests only for proto in protos: - # @lint-ignore BUCKLINT: avoid "Direct usage of native rules is not allowed." - native.export_file( + prelude.export_file( name = proto, visibility = ["PUBLIC"], ) @@ -169,17 +167,13 @@ def ocaml_binary( **kwargs): deps = _maybe_select_map(deps, _fix_deps) - # @lint-ignore BUCKLINT: avoid "native is forbidden in fbcode" - native.ocaml_binary( + prelude.ocaml_binary( deps = deps, visibility = visibility, **kwargs ) -# Configuration that is used when building open source using Buck2 as the build system. -# E.g. not applied either internally, or when using Cargo to build the open source code. -# At the moment of writing, mostly used to disable jemalloc. 
-_CFG_BUCK_OSS_BUILD = "--cfg=buck_oss_build"
+_CFG_BUCK_BUILD = "--cfg=buck_build"
 
 def _maybe_select_map(v, mapper):
     if is_select(v):
diff --git a/shim/target_determinator/macros/ci.bzl b/shim/target_determinator/macros/ci.bzl
index 7bd5c0fbb30..abbd47d8835 100644
--- a/shim/target_determinator/macros/ci.bzl
+++ b/shim/target_determinator/macros/ci.bzl
@@ -8,9 +8,11 @@
 def _lbl(*_args):
     return ""
 
-def _package(_values, overwrite = False):
-    # @lint-ignore BUILDIFIERLINT
-    _ = overwrite
+def _package(
+        _values,
+        # starlark-lint-disable unused-argument
+        overwrite = False):  # @unused
+    pass
 
 ci = struct(
     package = _package,
diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index 331559816aa..95749d12cbf 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -48,7 +48,7 @@ byteorder = "1.4.3"
 bytes = "1.0"
 bytesize = "1.1.0"
 chrono = "0.4.28"
-clap = { package = "clap", version = "4.0.7", features = ["derive", "env"] }
+clap = { package = "clap", version = "4.5.4", features = ["derive", "env", "string"] }
 clap-3 = { package = "clap", version = "3.2.24", features = ["derive", "env", "regex", "unicode", "wrap_help"] }
 common-path = "1.0.0"
 compact_str = "0.6.1"
diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl
index 5035521dbbd..03e4ef7830d 100644
--- a/shim/xplat/executorch/build/env_interface.bzl
+++ b/shim/xplat/executorch/build/env_interface.bzl
@@ -42,6 +42,7 @@ _EXTERNAL_DEPS = {
     "libtorch_python": "//third-party:libtorch_python",
     "prettytable": "//third-party:prettytable",
     "pybind11": "//third-party:pybind11",
+    "re2": [],  # TODO(larryliu0820): Add support
     # Core C++ PyTorch functionality like Tensor and ScalarType.
     "torch-core-cpp": "//third-party:libtorch",
     "torchgen": "//third-party:torchgen",
@@ -215,7 +216,7 @@ env = struct(
     # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
     genrule = native.genrule,
     is_oss = True,
-    is_xplat = False,
+    is_xplat = lambda: False,
     patch_deps = _patch_deps,
     patch_cxx_compiler_flags = _patch_cxx_compiler_flags,
     patch_executorch_genrule_cmd = _patch_executorch_genrule_cmd,
diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl
index 42cea1ae35d..3de73770e26 100644
--- a/shim/xplat/executorch/codegen/codegen.bzl
+++ b/shim/xplat/executorch/codegen/codegen.bzl
@@ -332,6 +332,7 @@ def executorch_generated_lib(
         define_static_targets = False,
         custom_ops_aten_kernel_deps = [],
         custom_ops_requires_runtime_registration = True,
+        custom_ops_requires_aot_registration = True,
         visibility = [],
         aten_mode = False,
         manual_registration = False,
@@ -536,7 +537,7 @@ def executorch_generated_lib(
         platforms = platforms,
     )
 
-    if custom_ops_yaml_target:
+    if custom_ops_yaml_target and custom_ops_requires_aot_registration:
         exir_custom_ops_aot_lib(
             name = "custom_ops_" + name,
             yaml_target = custom_ops_yaml_target,
diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl
index 04f21fdb1a0..405c2937553 100644
--- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl
+++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl
@@ -37,7 +37,7 @@ ATEN_MODULE_DEPS = [
 # Generated lib for all ATen ops with aten kernel used by models in model inventory
 MODELS_ATEN_OPS_ATEN_MODE_GENERATED_LIB = [
     "//executorch/kernels/quantized:generated_lib_aten",
-    "//executorch/kernels/aten:generated_lib_aten",
+    "//executorch/kernels/aten:generated_lib",
 ]
 
 def executorch_pybindings(python_module_name, srcs = [], cppdeps = [], visibility = ["//executorch/..."], types = [], compiler_flags = []):
diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
index 5f9508fceb6..32a6da5260f 100644
--- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1,4 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 
 def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = []):
@@ -122,7 +122,17 @@ def define_op_library(name, deps, android_deps, aten_target, _allow_third_party_
         fbandroid_platform_deps = android_deps,
         # kernels often have helpers with no prototypes just disabling the warning here as the headers
         # are codegend and linked in later
-        compiler_flags = ["-Wno-missing-prototypes"],
+        compiler_flags = ["-Wno-missing-prototypes"] + (
+            # For shared library build, we don't want to expose symbols of
+            # kernel implementation (e.g., torch::executor::native::tanh_out)
+            # to library users. They should use kernels through registry only.
+            # With visibility=hidden, the linker won't expose kernel impl symbols
+            # so it can prune unregistered kernels.
+            # Currently fbcode links all dependent libraries through shared
+            # library, and it blocks users like unit tests from using kernel
+            # implementations directly. So we enable this for xplat only.
+            ["-fvisibility=hidden"] if is_xplat() else []
+        ),
         deps = [
             "//executorch/runtime/kernel:kernel_includes" + aten_suffix,
         ] + deps,
diff --git a/test/size_test.cpp b/test/size_test.cpp
index cb186a7ff7d..88b605c3bf2 100644
--- a/test/size_test.cpp
+++ b/test/size_test.cpp
@@ -37,13 +37,17 @@ int main(int argc, char** argv) {
 
   Result<FileDataLoader> loader = FileDataLoader::from(argv[1]);
   ET_CHECK_MSG(
-      loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error());
+      loader.ok(),
+      "FileDataLoader::from() failed: 0x%" PRIx32,
+      static_cast<uint32_t>(loader.error()));
 
   uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model");
   const auto program = Program::load(&loader.get());
   EXECUTORCH_END_PROF(prof_tok);
   ET_CHECK_MSG(
-      program.ok(), "Program::load() failed: 0x%" PRIx32, program.error());
+      program.ok(),
+      "Program::load() failed: 0x%" PRIx32,
+      static_cast<uint32_t>(program.error()));
   ET_LOG(Info, "Program file %s loaded.", argv[1]);
 
   // Use the first method in the program.
diff --git a/third-party/pytorch b/third-party/pytorch
index 0a038cf0cff..b1984237a0f 160000
--- a/third-party/pytorch
+++ b/third-party/pytorch
@@ -1 +1 @@
-Subproject commit 0a038cf0cff2d071b7359ac0491fd2ba7798a438
+Subproject commit b1984237a0fb32b760c1b84d6d02d2f0f7ed293b
diff --git a/version.txt b/version.txt
index 9e8f5f9ed83..c181bf59966 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.0a0
+0.3.0a0