From 51c4a5a8897c57ab7393632982ccbf97c129aac5 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 7 May 2025 18:24:37 -0700 Subject: [PATCH 001/178] Vulkan tests use executorch_core Differential Revision: D74365586 Pull Request resolved: https://github.com/pytorch/executorch/pull/10765 --- backends/vulkan/test/CMakeLists.txt | 2 +- backends/vulkan/test/op_tests/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt index 95f0179f367..0b3f22875ad 100644 --- a/backends/vulkan/test/CMakeLists.txt +++ b/backends/vulkan/test/CMakeLists.txt @@ -82,7 +82,7 @@ if(TARGET vulkan_backend) ) target_include_directories(vulkan_compute_api_test PRIVATE ${COMMON_INCLUDES}) target_link_libraries( - vulkan_compute_api_test PRIVATE GTest::gtest_main vulkan_backend executorch + vulkan_compute_api_test PRIVATE GTest::gtest_main vulkan_backend executorch_core test_shaderlib ) target_compile_options(vulkan_compute_api_test PRIVATE ${VULKAN_CXX_FLAGS}) diff --git a/backends/vulkan/test/op_tests/CMakeLists.txt b/backends/vulkan/test/op_tests/CMakeLists.txt index 584719d5c28..a34d3f297f6 100644 --- a/backends/vulkan/test/op_tests/CMakeLists.txt +++ b/backends/vulkan/test/op_tests/CMakeLists.txt @@ -81,7 +81,7 @@ function(vulkan_op_test test_name test_src) ${test_name} PRIVATE GTest::gtest_main vulkan_backend - executorch + executorch_core ${LIB_TORCH} ${LIB_TORCH_CPU} ${LIB_C10} From 486398856a1fa416fa6a37a6e4d1691ba7e04210 Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Wed, 7 May 2025 18:29:36 -0700 Subject: [PATCH 002/178] Handle avg_pool2d with padding == 0 as no padding Differential Revision: D74117402 Pull Request resolved: https://github.com/pytorch/executorch/pull/10697 --- .../arm/operator_support/pool_2d_support.py | 5 +++- backends/arm/test/ops/test_avg_pool2d.py | 25 ++++++++++++++----- backends/arm/test/targets.bzl | 1 + 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index 750fab2730d..f4ada36de80 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -54,8 +54,11 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): kernel = cast(tuple[int, int], node.args[1]) stride = cast(tuple[int, int], node.args[2]) if len(node.args) > 3: + padding = cast(tuple[int, int], node.args[3]) # Padding case - if not all(1 <= k <= 8 for k in kernel): + if not all(1 <= k <= 8 for k in kernel) and not all( + v == 0 for v in padding + ): self.reporter.report_reject( node, f"Avgpool2d with padding needs kernel dims < 8, got {kernel}" ) diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py index 2a50ef38834..c48595aec7f 100644 --- a/backends/arm/test/ops/test_avg_pool2d.py +++ b/backends/arm/test/ops/test_avg_pool2d.py @@ -9,9 +9,11 @@ from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineBI, @@ -64,15 +66,24 @@ def forward(self, x): @common.parametrize("test_module", test_modules) +@pytest.mark.tosa_ref_model def test_avgpool2d_tosa_MI(test_module): model, input_tensor = test_module - pipeline = TosaPipelineMI[input_t](model, input_tensor, 
aten_op, exir_op) - pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) - pipeline.run() + pipeline = TosaPipelineMI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + run_on_tosa_ref_model=conftest.is_option_enabled("tosa_ref_model"), + ) + if conftest.is_option_enabled("tosa_ref_model"): + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + pipeline.run() @common.parametrize("test_module", test_modules) +@pytest.mark.tosa_ref_model def test_avgpool2d_tosa_BI(test_module): model, input_tensor = test_module @@ -82,9 +93,11 @@ def test_avgpool2d_tosa_BI(test_module): aten_op, exir_op, symmetric_io_quantization=True, + run_on_tosa_ref_model=conftest.is_option_enabled("tosa_ref_model"), ) - pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) - pipeline.run() + if conftest.is_option_enabled("tosa_ref_model"): + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + pipeline.run() @common.parametrize("test_module", test_modules) diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 832dcb3286c..acb27f13798 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -13,6 +13,7 @@ def define_arm_tests(): # Operators test_files += [ + "ops/test_avg_pool2d.py", "ops/test_linear.py", "ops/test_slice.py", "ops/test_sigmoid.py", From bf5b99a5211c37eb0fdba00b1fbac686e7d72446 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 7 May 2025 18:57:59 -0700 Subject: [PATCH 003/178] Update buck2 to 2025-05-06 (#10742) Notably, pinned prelude version includes https://github.com/facebook/buck2-prelude/commit/958af4f5e064aed9fcbfc51d68a052835671a2ff . Also, we're able to simplify our Buck versioning logic now that Buck has consistent versions across platforms (https://github.com/facebook/buck2/issues/828#issuecomment-2755327893) --- .ci/docker/ci_commit_pins/buck2.txt | 2 +- third-party/prelude | 2 +- tools/cmake/resolve_buck.py | 48 ++++++++--------------------- 3 files changed, 15 insertions(+), 37 deletions(-) diff --git a/.ci/docker/ci_commit_pins/buck2.txt b/.ci/docker/ci_commit_pins/buck2.txt index 1b22c8ffc09..38d6362c8cb 100644 --- a/.ci/docker/ci_commit_pins/buck2.txt +++ b/.ci/docker/ci_commit_pins/buck2.txt @@ -1 +1 @@ -2024-12-16 +2025-05-06 diff --git a/third-party/prelude b/third-party/prelude index 851d3f09c45..48c249f8c7b 160000 --- a/third-party/prelude +++ b/third-party/prelude @@ -1 +1 @@ -Subproject commit 851d3f09c452937fc5adef27e2c50f7f304f1646 +Subproject commit 48c249f8c7b99ff501d6e857754760315072b306 diff --git a/tools/cmake/resolve_buck.py b/tools/cmake/resolve_buck.py index 6da0a81b6de..f9c42a0a3c8 100644 --- a/tools/cmake/resolve_buck.py +++ b/tools/cmake/resolve_buck.py @@ -15,7 +15,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Sequence, Union +from typing import Union import buck_util import zstd @@ -46,54 +46,34 @@ def _buck_version_path() -> Path: @dataclass class BuckInfo: archive_name: str - target_versions: Sequence[str] -# Mapping of os family and architecture to buck2 binary versions. The -# target version is the hash given by running 'buck2 --version'. The +# Mapping of os family and architecture to buck2 archive name. The target version is the +# hash given by running 'buck2 --version', which is now consistent across platforms. The # archive name is the archive file name to download, as seen under # https://github.com/facebook/buck2/releases/. 
# -# To add or update versions, download the appropriate version of buck2 -# and run 'buck2 --version'. Add the corresponding entry to the platform -# map below, and if adding new os families or architectures, update the -# platform detection logic in resolve_buck2(). -# -# Some platforms (linux) provide multiple binaries (GNU and MUSL). All -# versions in the list are accepted when validating a user-provided or -# system buck2. +# To update Buck2, download the appropriate version of buck2 for your platform, run +# 'buck2 --version', and update BUCK_TARGET_VERSION. To add a new platform, add the +# corresponding entry to the platform map below, and if adding new os families or +# architectures, update the platform detection logic in resolve_buck2(). +BUCK_TARGET_VERSION = "2025-05-06-201beb86106fecdc84e30260b0f1abb5bf576988" + BUCK_PLATFORM_MAP = { ("linux", "x86_64"): BuckInfo( archive_name="buck2-x86_64-unknown-linux-musl.zst", - target_versions=[ - # MUSL - "edae27cfca00053d9c5f7c7be81b6b0d7d07573a50be374ce53a9d8692afa5fc", - # GNU - "10334cb20cb7c321", - ], ), ("linux", "aarch64"): BuckInfo( archive_name="buck2-aarch64-unknown-linux-gnu.zst", - target_versions=[ - # MUSL - "5d7af382acbe0dde70f0e9b0a0bc36deea906077ec1ffe80d3fa280490109051", - # GNU - "08d4382de22fab275978abc7c27c001d7823eb2f", - ], ), ("darwin", "aarch64"): BuckInfo( archive_name="buck2-aarch64-apple-darwin.zst", - target_versions=["f3b7a37732803ed090cd8a37f00cc000"], ), ("darwin", "x86_64"): BuckInfo( archive_name="buck2-x86_64-apple-darwin.zst", - target_versions=["9c9a583658d43e82b41f3fc9d369a9b0"], ), ("windows", "x86_64"): BuckInfo( archive_name="buck2-x86_64-pc-windows-msvc.exe.zst", - target_versions=[ - "c7d378f3f307e9590f0b29a5f7f1b21b8e784f4e4bd30a0160b2a69df50d2ee0" - ], ), } @@ -160,13 +140,13 @@ def resolve_buck2(args: argparse.Namespace) -> Union[str, int]: # If we have an explicit buck2 arg, check the version and fail if # there is a mismatch. ver = buck_util.get_buck2_version(args.buck2) - if ver in buck_info.target_versions: + if ver == BUCK_TARGET_VERSION: return args.buck2 else: print( f'The provided buck2 binary "{args.buck2}" reports version ' f'"{ver}", but ExecuTorch needs version ' - f'"{buck_info.target_versions[0]}". Ensure that the correct buck2' + f'"{BUCK_TARGET_VERSION}". Ensure that the correct buck2' " version is installed or avoid explicitly passing the BUCK2 " "version to automatically download the correct version.", file=sys.stderr, @@ -181,7 +161,7 @@ def resolve_buck2(args: argparse.Namespace) -> Union[str, int]: # Look for system buck2 and check version. Note that this can return # None. ver = buck_util.get_buck2_version("buck2") - if ver in buck_info.target_versions: + if ver == BUCK_TARGET_VERSION: # Use system buck2. return "buck2" else: @@ -190,9 +170,7 @@ def resolve_buck2(args: argparse.Namespace) -> Union[str, int]: os.makedirs(cache_dir, exist_ok=True) buck2_local_path = ( - (cache_dir / f"buck2-{buck_info.target_versions[0]}") - .absolute() - .as_posix() + (cache_dir / f"buck2-{BUCK_TARGET_VERSION}").absolute().as_posix() ) # Check for a previously cached buck2 binary. 
The filename includes From bb7e50f095533a88437c7aa457e204c1bf752544 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 7 May 2025 19:23:49 -0700 Subject: [PATCH 004/178] Tests use executorch_core Differential Revision: D74369346 Pull Request resolved: https://github.com/pytorch/executorch/pull/10764 --- examples/selective_build/CMakeLists.txt | 9 ++++----- runtime/kernel/test/CMakeLists.txt | 6 +++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/selective_build/CMakeLists.txt b/examples/selective_build/CMakeLists.txt index db570bb98c3..fc059c2cc68 100644 --- a/examples/selective_build/CMakeLists.txt +++ b/examples/selective_build/CMakeLists.txt @@ -43,7 +43,7 @@ find_package( gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party ) -target_include_directories(executorch INTERFACE ${_common_include_directories}) +target_include_directories(executorch_core INTERFACE ${_common_include_directories}) # ------------------------------ OPTIONS BEGIN ------------------------------- @@ -91,7 +91,7 @@ if(EXECUTORCH_SELECT_OPS_YAML) # custom_kernels: C++ kernel implementations of custom ops # add_library(custom_kernels ${kernel_sources}) - target_link_libraries(custom_kernels PRIVATE executorch) + target_link_libraries(custom_kernels PRIVATE executorch_core) target_compile_options(custom_kernels PUBLIC ${_common_compile_options}) list(APPEND _kernel_lib custom_kernels) @@ -117,7 +117,7 @@ generate_bindings_for_kernels( ) gen_operators_lib( - LIB_NAME "select_build_lib" KERNEL_LIBS ${_kernel_lib} DEPS executorch + LIB_NAME "select_build_lib" KERNEL_LIBS ${_kernel_lib} DEPS executorch_core ) list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") @@ -131,10 +131,9 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(selective_build_test PRIVATE "LINKER:--gc-sections") endif() target_link_libraries( - selective_build_test PRIVATE executorch gflags select_build_lib + selective_build_test PRIVATE executorch_core gflags select_build_lib ) target_link_options_shared_lib(select_build_lib) -target_link_options_shared_lib(executorch) target_compile_options(selective_build_test PUBLIC ${_common_compile_options}) # Print all summary diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt index 9ff47fbefd5..5a9c4f0febf 100644 --- a/runtime/kernel/test/CMakeLists.txt +++ b/runtime/kernel/test/CMakeLists.txt @@ -20,7 +20,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_executable(operator_registry_test operator_registry_test.cpp) target_link_libraries( - operator_registry_test GTest::gtest GTest::gtest_main GTest::gmock executorch + operator_registry_test GTest::gtest GTest::gtest_main GTest::gmock executorch_core ) target_include_directories(operator_registry_test PRIVATE ${EXECUTORCH_ROOT}/..) add_test(operator_registry_test operator_registry_test) @@ -28,7 +28,7 @@ add_test(operator_registry_test operator_registry_test) add_executable(kernel_runtime_context_test kernel_runtime_context_test.cpp) target_link_libraries( kernel_runtime_context_test GTest::gtest GTest::gtest_main GTest::gmock - executorch + executorch_core ) target_include_directories( kernel_runtime_context_test PRIVATE ${EXECUTORCH_ROOT}/.. 
@@ -47,7 +47,7 @@ add_executable( ) target_link_libraries( operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main - GTest::gmock executorch + GTest::gmock executorch_core ) target_compile_definitions( operator_registry_max_kernel_num_test PRIVATE "-DMAX_KERNEL_NUM=1" From b1d00e2a46b17864545b250dbfd17de15c11c9e9 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 8 May 2025 02:39:22 -0400 Subject: [PATCH 005/178] [ET-VK] Introduce generic export pass for fusing Q/DQ nodes (#10771) ## Context When quantizing models with the PT2E quantization flow, quantize/dequantize nodes will be inserted into the graph. However, these quantize/dequantize nodes must be fused with operators such as `aten.linear.default` to produce nodes corresponding to quantized operators (e.g. `weight_int8pack_mm`) in order for quantized operator implementations to be called at runtime. Currently, the op fusion is done by the `fuse_dequant_linear.py` pass, however, this only handles one specific fusion pattern to generate a `weight_int8pack_mm` operator. As more quantized operators are to be supported in ET-VK via the PT2E quantization flow, a more generic fusion pass is needed that can handle a variety of fusion patterns. ## Changes Introduce the `FuseQuantizedOpsTransform()` pass. I elected to introduce a new pass under the `backends/vulkan/_passes` directory, as opposed to modifying the existing pass because I anticipate the majority of the fusion patterns to be specific to ET-VK. Remove the existing `FuseDequantLinearPass()` Switch to using the `FuseQuantizedOpsTransform` pass instead of the old `FuseDequantLinear` pass. Add `test_vulkan_passes` Python test to test export passes. Some small refactors to `test_vulkan_delegate` Python test to improve code organizations. Differential Revision: [D73794042](https://our.internmc.facebook.com/intern/diff/D73794042/) --- backends/transforms/fuse_dequant_linear.py | 77 ------ backends/transforms/targets.bzl | 15 -- backends/vulkan/_passes/TARGETS | 19 ++ backends/vulkan/_passes/__init__.py | 4 + backends/vulkan/_passes/fuse_quantized_ops.py | 229 ++++++++++++++++++ backends/vulkan/custom_ops_lib.py | 47 ++++ backends/vulkan/quantizer/vulkan_quantizer.py | 63 ++--- backends/vulkan/targets.bzl | 2 +- backends/vulkan/test/TARGETS | 13 + backends/vulkan/test/test_vulkan_delegate.py | 213 ++++++++++------ backends/vulkan/test/test_vulkan_passes.py | 151 ++++++++++++ backends/vulkan/utils.py | 69 ++++++ backends/vulkan/vulkan_preprocess.py | 4 +- extension/llm/export/quantizer_lib.py | 8 +- 14 files changed, 712 insertions(+), 202 deletions(-) delete mode 100644 backends/transforms/fuse_dequant_linear.py create mode 100644 backends/vulkan/_passes/fuse_quantized_ops.py create mode 100644 backends/vulkan/test/test_vulkan_passes.py diff --git a/backends/transforms/fuse_dequant_linear.py b/backends/transforms/fuse_dequant_linear.py deleted file mode 100644 index 235715ac74f..00000000000 --- a/backends/transforms/fuse_dequant_linear.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -import torch - -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult - - -class FuseDequantLinearPass(ExportPass): - """ - Fuses weight dequantize_per_channel nodes with linear nodes into - weight_int8pack_mm nodes, for 8-bit weight-only quantization. - - Replaces dq(weight) -> linear(activation, dq) with weight_int8pack_mm - Replaces dq(weight) -> linear(activation, dq, bias) with weight_int8pack_mm -> add - """ - - def fuse_dequant_with_linear( - self, - graph_module: torch.fx.GraphModule, - dequant_node: torch.fx.Node, - linear_node: torch.fx.Node, - ) -> None: - activations = linear_node.args[0] - bias = None - if len(linear_node.args) > 2: - bias = linear_node.args[2] - quant_weight = dequant_node.args[0] - scale = dequant_node.args[1] - - with graph_module.graph.inserting_before(linear_node): - weight_int8pack_mm_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten._weight_int8pack_mm.default, - (activations, quant_weight, scale), - ) - if bias: - add_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten.add.Tensor, - (weight_int8pack_mm_node, bias), - ) - linear_node.replace_all_uses_with(add_node) - else: - linear_node.replace_all_uses_with(weight_int8pack_mm_node) - graph_module.graph.erase_node(linear_node) - graph_module.graph.erase_node(dequant_node) - - def is_node_target( - self, node: torch.fx.Node, target: torch._ops.OperatorBase - ) -> bool: - return node.op == "call_function" and node.target == target - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - if self.is_node_target(node, exir_ops.edge.aten.linear.default): - weight_node = node.args[1] - if self.is_node_target( - weight_node, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - ): - # only fuse if weight tensor is int8 packed - quant_weight = weight_node.args[0] - if quant_weight.meta["val"].dtype != torch.int8: - continue - self.fuse_dequant_with_linear(graph_module, weight_node, node) - - graph_module.recompile() - graph_module = super().call(graph_module).graph_module - - return PassResult(graph_module, True) diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl index 66ff9111f52..71980195962 100644 --- a/backends/transforms/targets.bzl +++ b/backends/transforms/targets.bzl @@ -77,21 +77,6 @@ def define_common_targets(): ], ) - runtime.python_library( - name = "fuse_dequant_linear", - srcs = ["fuse_dequant_linear.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":utils", - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], - ) - runtime.python_library( name = "view_copy_to_squeeze_unsqueeze", srcs = ["view_copy_to_squeeze_unsqueeze.py"], diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index 5478ad0eab6..cfe20892994 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -3,6 +3,23 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") +runtime.python_library( + name = "fuse_quantized_ops", + srcs = ["fuse_quantized_ops.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/transforms:utils", + "//executorch/backends/vulkan:custom_ops_lib", + "//executorch/backends/vulkan:utils_lib", + "//executorch/exir:pass_base", + 
"//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], +) + runtime.python_library( name = "insert_prepack_nodes", srcs = ["insert_prepack_nodes.py"], @@ -13,6 +30,7 @@ runtime.python_library( "//caffe2:torch", "//executorch/exir:pass_base", "//executorch/backends/vulkan:utils_lib", + "//executorch/backends/vulkan:op_registry", ], ) @@ -110,6 +128,7 @@ runtime.python_library( "//executorch/examples/...", ], deps = [ + ":fuse_quantized_ops", ":insert_prepack_nodes", ":int4_weight_only_quantizer", ":remove_asserts", diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index 220afa6a35c..7ff93a6ee38 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -6,6 +6,9 @@ # pyre-strict +from executorch.backends.vulkan._passes.fuse_quantized_ops import ( + FuseQuantizedOpsTransform, +) from executorch.backends.vulkan._passes.insert_prepack_nodes import insert_prepack_nodes from executorch.backends.vulkan._passes.int4_weight_only_quantizer import ( VkInt4WeightOnlyQuantizer, @@ -26,6 +29,7 @@ from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass __all__ = [ + "FuseQuantizedOpsTransform", "insert_prepack_nodes", "VkInt4WeightOnlyQuantizer", "remove_asserts", diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py new file mode 100644 index 00000000000..d510e1d4342 --- /dev/null +++ b/backends/vulkan/_passes/fuse_quantized_ops.py @@ -0,0 +1,229 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import Optional, Tuple + +import executorch.backends.vulkan.utils as utils +import torch + +import torch.nn.functional as F + +from executorch.backends.transforms.utils import get_param_tensor, is_param_node +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + +################# +## linear_qcnw ## +################# + + +def matches_linear_qcnw_pattern( # noqa: C901 + program: ExportedProgram, node: torch.fx.Node +) -> Optional[Tuple[torch.qscheme, int]]: + """ + Checks if the nodes surrounding a linear node matches the pattern for weight only + quantized linear, where the weight is quantized channelswise to n bits. + + If the graph pattern matches, then return a tuple of (quantization_method, nbits) + describing the type of quantization used for the weights. Otherwise, return None. 
+ """ + if not utils.is_linear_node(node): + return None + + input_node = node.args[0] + weight_node = node.args[1] + + # Type checking + if not isinstance(weight_node, torch.fx.Node): + return None + if not isinstance(input_node, torch.fx.Node): + return None + + # The input arg should not be a dequant node; if it is, then it is indicative that + # dynamically quantized linear should be used instead + if utils.is_dequant_node(input_node): + return None + + # The weight arg should be a dequant node dequantizing the quantized weight + # Furthermore, the op expects per channel quantization of the weight + if not utils.is_dequant_per_channel_node(weight_node): + return None + + orig_weight = weight_node.args[0] + zeros = weight_node.args[2] + + # Type checking + if not isinstance(orig_weight, torch.fx.Node): + return None + if not is_param_node(program, orig_weight): + return None + if not isinstance(zeros, torch.fx.Node): + return None + if not is_param_node(program, zeros): + return None + + zeros_tensor = get_param_tensor(program, zeros) + if not isinstance(zeros_tensor, torch.Tensor): + return None + + quant_method = torch.per_channel_affine + # Check for symmetric quantization, where the zeros used for dequantization will + # actually be all zeros. + if torch.all(zeros_tensor == 0): + quant_method = torch.per_channel_symmetric + + orig_weight_tensor = get_param_tensor(program, orig_weight) + if not isinstance(orig_weight_tensor, torch.Tensor): + return None + # Sanity check the dtype of the quantized weight + if orig_weight_tensor.dtype != torch.int8: + return None + + quant_min = orig_weight_tensor.min().item() + quant_max = orig_weight_tensor.max().item() + # Determine the number of bits the weight has been quantized to + if quant_min >= -8 and quant_max <= 7: + return quant_method, 4 + elif quant_min >= -128 and quant_max <= 127: + return quant_method, 8 + + return None + + +def pack_4bit_weight_tensor(inp: torch.Tensor) -> torch.Tensor: + """ + Given a 8-bit weight tensor containing values quantized to 4 bits, create a packed + weight tensor by packing 2 4-bit values in one unsigned 8-bit value. + + An input weight tensor of shape (M, K) will produce a packed weight tensor of shape + (M, K / 2). + """ + + # Assert we got a properly quantized tensor. + min, max = inp.min().item(), inp.max().item() + assert ( + max <= 7 and min >= -8 + ), f"convert_to_qc4w: [min,max] out of [-8, 7] range, got [{min}, {max}]" + + # Assuming we have a 2d tensor + if inp.ndim != 2: + inp = inp.squeeze() + assert ( + inp.ndim == 2 + ), f"convert_to_qc4w: expecting input tensor to be 2d, got {inp.ndim}" + + # pad ic + if inp.shape[-1] % 2 != 0: + inp = F.pad(input=inp, pad=(0, 1, 0, 0), mode="constant", value=0) + + # Shape after padding + oc, ic = inp.shape + assert ic % 2 == 0, "convert_to_qc4w: expecting ic to be even" + + # Adjust inp tensor for zp + inp = inp.to(dtype=torch.uint8) + 8 + + # Prepare the Result tensor + inp = inp.contiguous().view(-1) + return (inp[::2] << 4 | inp[1::2]).view(oc, int(ic / 2)) + + +def fuse_into_linear_qcnw_node( + program: ExportedProgram, + graph_module: torch.fx.GraphModule, + linear_node: torch.fx.Node, + quant_method: torch.qscheme, + nbits: int, +) -> None: + """ + The weight_int8pack_mm operator represents a weight only quantized linear operator, + where the weight tensor has been quantized channelswise to nbits bits. 
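+    For nbits == 4 with symmetric quantization, the et_vk.linear_qcs4w custom
+    operator is produced instead of aten._weight_int8pack_mm.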
+ + After the PT2E quantization flow, the expected graph pattern is + + dq_weight = dequantize(weight, scales) + out = linear(activation, dq_weight, bias?) + + The goal of this function is to condense that sequence into + + out = quantized_linear(activation, dq_weight, scales) + out = out + bias + """ + activation = linear_node.args[0] + dq_weight_node = linear_node.args[1] + assert isinstance(activation, torch.fx.Node) + assert isinstance(dq_weight_node, torch.fx.Node) + + bias = None + if len(linear_node.args) > 2: + bias = linear_node.args[2] + assert isinstance(bias, torch.fx.Node) + + orig_weight = dq_weight_node.args[0] + scale = dq_weight_node.args[1] + + # For 4 bit quantization, pack the weight tensor + if nbits == 4: + assert isinstance(orig_weight, torch.fx.Node) + orig_weight_tensor = get_param_tensor(program, orig_weight) + assert isinstance(orig_weight_tensor, torch.Tensor) + packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor) + utils.update_program_state_dict( + program, + orig_weight.name, + packed_weight_tensor, + ) + orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8) + + if nbits == 8 and quant_method == torch.per_channel_symmetric: + op_target = exir_ops.edge.aten._weight_int8pack_mm.default + elif nbits == 4 and quant_method == torch.per_channel_symmetric: + op_target = exir_ops.edge.et_vk.linear_qcs4w.default + else: + raise NotImplementedError( + "only 4 and 8 bits per channel symmetric quant supported for linear_qcnw" + ) + + with graph_module.graph.inserting_before(linear_node): + weight_int8pack_mm_node = graph_module.graph.create_node( + "call_function", + op_target, + (activation, orig_weight, scale), + ) + if bias: + add_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.aten.add.Tensor, + (weight_int8pack_mm_node, bias), + ) + linear_node.replace_all_uses_with(add_node) + else: + linear_node.replace_all_uses_with(weight_int8pack_mm_node) + graph_module.graph.erase_node(linear_node) + graph_module.graph.erase_node(dq_weight_node) + + +class FuseQuantizedOpsTransform(ExportPass): + def __init__(self, exported_program: ExportedProgram) -> None: + super().__init__() + self.program = exported_program + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + for node in graph_module.graph.nodes: + qcnw_details = matches_linear_qcnw_pattern(self.program, node) + if qcnw_details is not None: + qcnw_method, qcnw_nbits = qcnw_details + fuse_into_linear_qcnw_node( + self.program, graph_module, node, qcnw_method, qcnw_nbits + ) + + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py index 0275239a86a..af6fcbfbb14 100644 --- a/backends/vulkan/custom_ops_lib.py +++ b/backends/vulkan/custom_ops_lib.py @@ -184,6 +184,53 @@ def linear_weight_int4_impl( lib.impl(name, linear_weight_int4_impl, "CompositeExplicitAutograd") linear_weight_int4_op = getattr(getattr(torch.ops, namespace), name) +################# +## linear_qcs4w ## +################# + + +def linear_qcs4w( + x: torch.Tensor, + weights_4x2: torch.Tensor, + scales: torch.Tensor, +): + original_x_shape = x.shape + x = x.reshape(-1, original_x_shape[-1]) + + unpacked_weights_shape = weights_4x2.shape + out_features = unpacked_weights_shape[0] + in_features = unpacked_weights_shape[1] + + weights_unpacked = torch.empty( + (out_features, in_features * 2), dtype=torch.int8, 
device=weights_4x2.device + ) + + weights_unpacked[:, ::2] = weights_4x2 >> 4 + weights_unpacked[:, 1::2] = weights_4x2 & 0x0F + + n_bit = 8 + quant_min = -(2 ** (n_bit - 1)) + quant_max = 2 ** (n_bit - 1) - 1 + dq_weights = torch.ops.quantized_decomposed.dequantize_per_channel( + weights_unpacked, + scales, + None, + 0, + quant_min, + quant_max, + torch.int8, + ) + + out = torch.nn.functional.linear(x, dq_weights) + out_shape = original_x_shape[:-1] + (out_features,) + return out.reshape(out_shape) + + +name = "linear_qcs4w" +lib.define(f"{name}(Tensor self, Tensor weight, Tensor scales) -> Tensor") +lib.impl(name, linear_qcs4w, "CompositeExplicitAutograd") +linear_qc4w_op = getattr(getattr(torch.ops, namespace), name) + ###################### ## apply_rotary_emb ## ###################### diff --git a/backends/vulkan/quantizer/vulkan_quantizer.py b/backends/vulkan/quantizer/vulkan_quantizer.py index 2ea3e321dc3..b2f1a658040 100644 --- a/backends/vulkan/quantizer/vulkan_quantizer.py +++ b/backends/vulkan/quantizer/vulkan_quantizer.py @@ -9,7 +9,7 @@ from __future__ import annotations import functools -from typing import Any, Callable, Dict, Optional +from typing import Callable, Optional import torch from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import ( @@ -18,53 +18,60 @@ propagate_annotation, QuantizationConfig, ) -from torch.ao.quantization.observer import MinMaxObserver, PerChannelMinMaxObserver -from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor +from torch.ao.quantization.observer import PerChannelMinMaxObserver from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer from torch.fx import Node __all__ = [ "VulkanQuantizer", - "get_weight_quantization_config", + "get_linear_weight_qcs_qspec", + "get_linear_weight_only_qcs_xnn_qconfig", ] -@functools.lru_cache -def get_weight_quantization_config( - is_per_channel: bool = True, - weight_qmin: int = -128, - weight_qmax: int = 127, -) -> QuantizationConfig: - - weight_qscheme = ( - torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric - ) - weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( - PerChannelMinMaxObserver if is_per_channel else MinMaxObserver - ) - extra_args: Dict[str, Any] = {"eps": 2**-12} +def get_linear_weight_qcs_qspec(quant_bits: int) -> QuantizationSpec: + """ + Return a QuantizationSpec to perform per-channel symmetric (i.e. "qcs") quantization + of weight tensors of linear layers to the number of bits specified by quant_bits. 
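+
+    For example, quant_bits=4 yields a [quant_min, quant_max] range of [-8, 7],
+    and quant_bits=8 yields [-128, 127].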
+ """ + weight_observer = PerChannelMinMaxObserver + assert quant_bits in { + 8, + 4, + }, f"Unsupported weight quantization bits: {quant_bits}" - weight_quantization_spec = QuantizationSpec( + quant_min = -(2 ** (quant_bits - 1)) + quant_max = 2 ** (quant_bits - 1) - 1 + qscheme = torch.per_channel_symmetric + + return QuantizationSpec( dtype=torch.int8, - quant_min=weight_qmin, - quant_max=weight_qmax, - qscheme=weight_qscheme, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, ch_axis=0, is_dynamic=False, - observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( - **extra_args - ), + observer_or_fake_quant_ctr=weight_observer, ) - quantization_config = QuantizationConfig( + +@functools.lru_cache +def get_linear_weight_only_qcs_xnn_qconfig(quant_bits: int) -> QuantizationConfig: + """ + Return a XNNPACKQuantizer QuantizationConfig class instance that specifies + quantizing the weight tensors of linear layers using per-channel symmetric (qcs) + quantization to the number of bits specified by quant_bits. + """ + weight_qspec = get_linear_weight_qcs_qspec(quant_bits) + + return QuantizationConfig( input_activation=None, output_activation=None, - weight=weight_quantization_spec, + weight=weight_qspec, bias=None, is_qat=False, ) - return quantization_config _SUPPORTED_OPS = [ diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index aafc87ad2c3..665fde103fc 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -280,6 +280,7 @@ def define_common_targets(is_fbcode = False): deps = [ "//caffe2:torch", "//executorch/exir:tensor", + "//executorch/exir/backend/canonical_partitioners:config_partitioner_lib", "//executorch/backends/vulkan/serialization:lib", ] ) @@ -332,7 +333,6 @@ def define_common_targets(is_fbcode = False): "//executorch/backends/transforms:addmm_mm_to_linear", "//executorch/backends/transforms:fuse_batch_norm_with_conv", "//executorch/backends/transforms:fuse_conv_with_clamp", - "//executorch/backends/transforms:fuse_dequant_linear", "//executorch/backends/transforms:fuse_view_copy", "//executorch/backends/transforms:remove_clone_ops", "//executorch/backends/transforms:view_copy_to_squeeze_unsqueeze", diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS index 5ac87892762..8f07040d586 100644 --- a/backends/vulkan/test/TARGETS +++ b/backends/vulkan/test/TARGETS @@ -24,6 +24,19 @@ python_unittest( ], ) +python_unittest( + name = "test_vulkan_passes", + srcs = [ + "test_vulkan_passes.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/_passes:vulkan_passes", + "//executorch/backends/vulkan/quantizer:vulkan_quantizer", + "//executorch/backends/vulkan:vulkan_preprocess", + ] +) + python_unittest( name = "test_vulkan_delegate_header", srcs = [ diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 5fba5ed54cf..b57710974e8 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -15,10 +15,19 @@ from executorch.backends.transforms.convert_dtype_pass import I64toI32 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner + from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend -from executorch.exir import EdgeCompileConfig -from torch.export import Dim, export, ExportedProgram +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchProgramManager, +) + +from 
torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + +from torch.ao.quantization.quantizer import Quantizer +from torch.export import Dim, export, export_for_training, ExportedProgram ctypes.CDLL("libvulkan.so.1") @@ -30,11 +39,66 @@ from executorch.extension.pytree import tree_flatten -class TestBackends(unittest.TestCase): - _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig( +def lower_module( + model: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], dynamic_shapes=None +) -> EdgeProgramManager: + compile_options = {} + edge_compile_config = EdgeCompileConfig( + _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. + ) + + program: ExportedProgram = export( + model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True + ) + + edge_program = to_edge_transform_and_lower( + program, + compile_config=edge_compile_config, + transform_passes=[ + I64toI32(edge_compile_config._skip_dim_order), + ], + partitioner=[VulkanPartitioner(compile_options)], + ) + + return edge_program + + +def quantize_and_lower_module( + model: torch.nn.Module, + sample_inputs: Tuple[torch.Tensor], + quantizer: Quantizer, + dynamic_shapes=None, +) -> EdgeProgramManager: + compile_options = {} + edge_compile_config = EdgeCompileConfig( _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. ) + program = export_for_training( + model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True + ).module() + + program = prepare_pt2e(program, quantizer) # pyre-ignore + # Calibrate + program(*sample_inputs) + + program = convert_pt2e(program) + + program = export(program, sample_inputs, dynamic_shapes=dynamic_shapes) + + edge_program = to_edge_transform_and_lower( + program, + compile_config=edge_compile_config, + transform_passes=[ + I64toI32(edge_compile_config._skip_dim_order), + ], + partitioner=[VulkanPartitioner(compile_options)], + ) + + return edge_program + + +class TestVulkanBackend(unittest.TestCase): def assert_outputs_equal( self, model_output, @@ -88,6 +152,59 @@ def assert_outputs_equal( ) ) + def check_no_delegation(self, et_program: ExecutorchProgramManager): + self.assertEqual( + len(et_program.executorch_program.execution_plan[0].delegates), + 0, + ) + return + + def check_vk_delegation(self, et_program: ExecutorchProgramManager): + self.assertEqual( + et_program.executorch_program.execution_plan[0].delegates[0].id, + VulkanBackend.__name__, + ) + + def run_delegated_model_and_check_output( + self, + et_program: ExecutorchProgramManager, + model: torch.nn.Module, + sample_inputs: Tuple[torch.Tensor], + atol=1e-03, + rtol=1e-01, + test_inputs=None, + first_output_only=False, + ): + executorch_module = _load_for_executorch_from_buffer(et_program.buffer) + inputs_flattened, _ = tree_flatten(sample_inputs) + + model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) + ref_output = model(*sample_inputs) + + self.assert_outputs_equal( + model_output, + ref_output, + atol=atol, + rtol=rtol, + first_output_only=first_output_only, + ) + + if test_inputs is not None: + for test_input in test_inputs: + test_inputs_flattened, _ = tree_flatten(test_input) + model_output = executorch_module.run_method( + "forward", tuple(test_inputs_flattened) + ) + ref_output = model(*test_input) + + self.assert_outputs_equal( + model_output, + ref_output, + atol=atol, + rtol=rtol, + first_output_only=first_output_only, + ) + def lower_module_and_test_output( self, model: torch.nn.Module, @@ -105,80 +222,29 @@ def 
lower_module_and_test_output( outputs with the outputs of the eager module. """ - def run_test(): - compile_options = {} + # Validate that the model can execute in eager mode + model.eval() + model(*sample_inputs) - # At least model should run in eager mode. - model.eval() - model(*sample_inputs) + edge_program = lower_module(model, sample_inputs, dynamic_shapes=dynamic_shapes) - program: ExportedProgram = export( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ) + et_program = edge_program.to_executorch() - edge_program = to_edge_transform_and_lower( - program, - compile_config=self._edge_compile_config, - transform_passes=[ - I64toI32(self._edge_compile_config._skip_dim_order), - ], - partitioner=[VulkanPartitioner(compile_options)], - ) - executorch_program = edge_program.to_executorch() - - if expect_no_delegates: - self.assertEqual( - len( - executorch_program.executorch_program.execution_plan[ - 0 - ].delegates - ), - 0, - ) - return - else: - self.assertEqual( - executorch_program.executorch_program.execution_plan[0] - .delegates[0] - .id, - VulkanBackend.__name__, - ) - - executorch_module = _load_for_executorch_from_buffer( - executorch_program.buffer - ) - inputs_flattened, _ = tree_flatten(sample_inputs) + if expect_no_delegates: + self.check_no_delegation(et_program) + return - model_output = executorch_module.run_method( - "forward", tuple(inputs_flattened) - ) - ref_output = model(*sample_inputs) - - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - if test_inputs is not None: - for test_input in test_inputs: - test_inputs_flattened, _ = tree_flatten(test_input) - model_output = executorch_module.run_method( - "forward", tuple(test_inputs_flattened) - ) - ref_output = model(*test_input) + self.check_vk_delegation(et_program) - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - run_test() + self.run_delegated_model_and_check_output( + et_program, + model, + sample_inputs, + atol, + rtol, + test_inputs=test_inputs, + first_output_only=first_output_only, + ) def test_vulkan_backend_add(self): # This test is the simplest test by manually lowering some submodules, we can use paritioner @@ -942,6 +1008,7 @@ def forward(self, x): sample_inputs, ) + @unittest.skip("layer norm compute shader not working with swiftshader") def test_vulkan_backend_native_layer_norm(self): class NativeLayerNormModule(torch.nn.Module): def __init__(self): diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py new file mode 100644 index 00000000000..7572ebd5a5a --- /dev/null +++ b/backends/vulkan/test/test_vulkan_passes.py @@ -0,0 +1,151 @@ +import unittest +from typing import Optional, Tuple + +import torch + +from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform +from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform + +from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( + get_linear_weight_only_qcs_xnn_qconfig, + VulkanQuantizer, +) + +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge + +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) + +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +from torch.ao.quantization.quantizer import Quantizer + +################### +## Common Models ## +################### + + 
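+# Minimal single-linear model used by the fusion tests below; its linear
+# weight is quantized per-channel symmetric (weight-only) before lowering.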
+class SingleLinearModule(torch.nn.Module): + def __init__(self, K=256, N=128): + super().__init__() + self.K = K + self.N = N + self.linear = torch.nn.Linear(K, N, bias=False) + + def forward(self, x): + return self.linear(x) + + def get_sample_inputs(self): + sample_inputs = (torch.rand(size=(32, self.K), dtype=torch.float32),) + return sample_inputs + + +########### +## Tests ## +########### + + +def quantize_and_lower_module( + model: torch.nn.Module, + sample_inputs: Tuple[torch.Tensor], + quantizer: Quantizer, + dynamic_shapes=None, +) -> EdgeProgramManager: + edge_compile_config = EdgeCompileConfig( + _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. + _check_ir_validity=False, + ) + + program = torch.export.export_for_training( + model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True + ).module() + + program = prepare_pt2e(program, quantizer) # pyre-ignore + # Calibrate + program(*sample_inputs) + + program = convert_pt2e(program) + + program = torch.export.export(program, sample_inputs, dynamic_shapes=dynamic_shapes) + + edge_program = to_edge( + program, + compile_config=edge_compile_config, + ) + + return edge_program + + +def get_target_canonical_name(node: torch.fx.Node) -> Optional[str]: + if node.op != "call_function": + return None + node_name = format_target_name(node.target.__name__) # pyre-ignore + return node_name + + +def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) -> int: + count = 0 + for node in graph_module.graph.nodes: + canonical_name = get_target_canonical_name(node) + if canonical_name is not None and canonical_name == canonical_op_name: + count += 1 + return count + + +class TestVulkanPasses(unittest.TestCase): + + def test_fuse_int8pack_mm(self): + K = 256 + N = 256 + model = SingleLinearModule(K, N) + sample_inputs = model.get_sample_inputs() + + quantizer = VulkanQuantizer() + quantizer.set_global(get_linear_weight_only_qcs_xnn_qconfig(8)) + + edge_manager = quantize_and_lower_module( + model, + sample_inputs, + quantizer, + ) + + ep = edge_manager._edge_programs["forward"] + edge_manager.transform( + [ + AddmmToLinearTransform(), + FuseQuantizedOpsTransform(ep), + ] + ) + + gm = ep.graph_module + + self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1) + self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) + + def test_fuse_linear_qcs4w(self): + K = 256 + N = 256 + model = SingleLinearModule(K, N) + sample_inputs = model.get_sample_inputs() + + quantizer = VulkanQuantizer() + quantizer.set_global(get_linear_weight_only_qcs_xnn_qconfig(4)) + + edge_manager = quantize_and_lower_module( + model, + sample_inputs, + quantizer, + ) + + ep = edge_manager._edge_programs["forward"] + edge_manager.transform( + [ + AddmmToLinearTransform(), + FuseQuantizedOpsTransform(ep), + ] + ) + + gm = ep.graph_module + + self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1) + self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index fa032cd7b4f..eb949a6ace8 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -14,6 +14,10 @@ VkStorageType, ) +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) + from executorch.exir.tensor import TensorSpec from torch._export.utils import is_buffer, is_param @@ -22,11 +26,44 @@ from torch.export import ExportedProgram +from torch.export.exported_program import InputKind +from 
torch.export.graph_signature import TensorArgument + +_DQ_OPS = { + "dequantize_per_tensor.tensor", + "dequantize_per_tensor.default", + "dequantize_per_channel.default", + "dequantize_per_channel_group.default", + "dequantize_per_token.default", + "dequantize_affine.default", +} + ## ## Node type determination ## +def is_dequant_node(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + return node_name in _DQ_OPS + + +def is_dequant_per_channel_node(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + return node_name == "dequantize_per_channel.default" + + +def is_linear_node(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + return node_name == "linear.default" + + def is_get_attr_node(node: torch.fx.Node) -> bool: return isinstance(node, torch.fx.Node) and node.op == "get_attr" @@ -258,3 +295,35 @@ def get_node_storage_type(node: torch.fx.Node) -> Optional[VkStorageType]: def get_node_memory_layout(node: torch.fx.Node) -> Optional[VkMemoryLayout]: return get_node_spec_attr(node, "vk_memory_layout") + + +## +## Misc +## + + +def update_program_state_dict( + program: ExportedProgram, + buffer_name: str, + updated_tensor: torch.Tensor, +) -> None: + target_name = None + # Iterate over all the tensors in the graph signature, and find + # the one corresponding to the parameter/buffer name + for input_ in program.graph_signature.input_specs: + if ( + input_.kind in (InputKind.BUFFER, InputKind.PARAMETER) + and isinstance(input_.arg, TensorArgument) + and input_.arg.name == buffer_name + ): + target_name = input_.target + break + + # Assert that we found the parameter/buffer + assert ( + target_name is not None + ), f"could not find {buffer_name} in source program signature" + assert target_name in program.state_dict, f"could not find {target_name}" + + # Finally, overwrite the current tensor with updated tensor + program.state_dict[target_name] = updated_tensor diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 188311e5f2c..4200df3e131 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -17,12 +17,12 @@ FuseBatchNormWithConvPass, ) from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass -from executorch.backends.transforms.fuse_dequant_linear import FuseDequantLinearPass from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform from executorch.backends.transforms.view_copy_to_squeeze_unsqueeze import ( ViewCopyToSqueezeUnsqueezePass, ) from executorch.backends.vulkan._passes import ( + FuseQuantizedOpsTransform, insert_prepack_nodes, RemoveLocalScalarDenseOpsTransform, RemoveRedundantOpsTransform, @@ -152,7 +152,7 @@ def preprocess( # noqa: C901 [ RemoveRedundantOpsTransform(), AddmmToLinearTransform(), - FuseDequantLinearPass(), + FuseQuantizedOpsTransform(program), SqueezeUnsqueezeInputs(), FuseViewCopyTransform(), ViewCopyToSqueezeUnsqueezePass(), diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 24c3be2e802..d7b8b3a92b1 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -266,16 +266,12 @@ def get_coreml_quantizer(pt2e_quantize: str): def get_vulkan_quantizer(pt2e_quantize: str): 
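    # "vulkan_8w" maps to 8-bit per-channel symmetric weight-only quantization
    # of linear layer weights.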
from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_weight_quantization_config, + get_linear_weight_only_qcs_xnn_qconfig, VulkanQuantizer, ) if pt2e_quantize == "vulkan_8w": - config = get_weight_quantization_config( - is_per_channel=True, - weight_qmin=-128, - weight_qmax=127, - ) + config = get_linear_weight_only_qcs_xnn_qconfig(8) else: raise ValueError(f"Unsupported Vulkan quantizer specification {pt2e_quantize}") From 5e8295ef80db6c32b65592d43e5fa8e9134daba9 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 8 May 2025 02:40:32 -0400 Subject: [PATCH 006/178] [ET-VK] Implement linear_qcs4w (#10772) ## Context Title says it all! ## Changes Extended the implementation of `linear_qcsnw` to support packed 4-bit weight tensors. Differential Revision: [D73941991](https://our.internmc.facebook.com/intern/diff/D73941991/) --- backends/vulkan/op_registry.py | 7 +- .../runtime/graph/ops/glsl/indexing_utils.h | 18 +- .../graph/ops/glsl/linear_qcsnw_coop.glsl | 145 +++++++++----- .../graph/ops/glsl/linear_qcsnw_coop.yaml | 10 + .../graph/ops/glsl/linear_qcsnw_tiled.glsl | 115 ++++++++--- .../graph/ops/glsl/linear_qcsnw_tiled.yaml | 10 + .../graph/ops/impl/QuantizedLinearQCSNW.cpp | 88 ++++++--- .../graph/ops/impl/QuantizedLinearQGANW.cpp | 48 ----- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 48 +++++ .../vulkan/runtime/graph/ops/impl/Staging.h | 7 + .../test/op_tests/linear_weight_int4_test.cpp | 182 +++++++++++++++++- 11 files changed, 528 insertions(+), 150 deletions(-) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index aa3cca5f384..8502e254ec5 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -377,7 +377,12 @@ def register_mm_op(features: OpFeatures): return features -@update_features(exir_ops.edge.aten._weight_int8pack_mm.default) +@update_features( + [ + exir_ops.edge.aten._weight_int8pack_mm.default, + exir_ops.edge.et_vk.linear_qcs4w.default, + ] +) def register_int8_mm_op(features: OpFeatures): features.texture_impl = TextureImplFeatures( uses_axis_map=False, diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2126104430f..2b41d2b7e1a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -41,22 +41,32 @@ /* * Fast division by 4 using bit shifting */ -#define div4(x) (x >> 2) +#define div4(x) ((x) >> 2) + +/* + * Fast multiplication by 4 using bit shifting + */ +#define mul4(x) ((x) << 2) /* * Divides input and rounds up to 4 */ -#define divup4(x) ((x + 3) >> 2) +#define divup4(x) (((x) + 3) >> 2) + +/* + * Divides input by denominator and rounds up + */ +#define divup(x, d) (((x) + (d) - 1) / (d)) /* * Aligns input to the next multiple of 4 */ -#define alignup4(x) ((x + 3) & -4) +#define alignup4(x) (((x) + 3) & -4) /* * Fast modulo by 4 using bit masking */ -#define mod4(x) (x & 3) +#define mod4(x) ((x) & 3) /* * Find the packed dimension of a tensor given its strides. 
The packed dimension diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl index 3ad9e759910..c766a3cd7d0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl @@ -14,6 +14,7 @@ #define VEC4_T ${buffer_gvec_type(DTYPE, 4)} #define TILE_ROWS ${TILE_ROWS} +#define TILE_TXCOLS ${TILE_TXCOLS} #define NGROUPS 8 #define NWORKERS 8 @@ -29,7 +30,10 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} +$if QUANT_NBITS == 4: + ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +$else: + ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} layout(push_constant) uniform restrict Block { @@ -42,12 +46,23 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS]; +shared VEC4_T partial_sums[NGROUPS][NWORKERS][TILE_ROWS][TILE_TXCOLS]; void main() { - const uint out_width_ntexels = divup4(out_sizes.x); - const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2; - const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS; + // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. + $if TILE_TXCOLS > 1: + const uint global_wg_x = uint(divup(out_sizes.x, 4 * TILE_TXCOLS)); + const uint out_txcol = uint( + (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); + $else: + const uint global_wg_x = uint(divup4(out_sizes.x)); + const uint out_txcol = uint(gl_GlobalInvocationID.x % global_wg_x); + + const uint out_row = uint( + (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); + + $if QUANT_NBITS == 4: + const uint weight_txcol = uint(out_txcol / 2); const int gid = int(gl_LocalInvocationID.x); // group id const int wid = int(gl_LocalInvocationID.z); // worker id @@ -56,46 +71,78 @@ void main() { return; } - VEC4_T a[TILE_ROWS]; - VEC4_T b[4]; - VEC4_T local_c[TILE_ROWS]; + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][TILE_TXCOLS]; + VEC4_T local_sums[TILE_ROWS][TILE_TXCOLS]; - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - local_c[i] = VEC4_T(0.0); + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + local_sums[r][${c}] = VEC4_T(0.0); } - $if SCALES_STORAGE == "buffer": - const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]); - $else: - const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0)); - - for (int pos = 4 * wid; pos < in_sizes.x; pos += (4 * NWORKERS)) { - // Preload t_weight - [[unroll]] for (int i = 0; i < 4; i++) { - $if WEIGHT_STORAGE == "buffer": - b[i] = t_weight[((pos + i) * weight_sizes.x + out_col) >> 2]; + VEC4_T scales[TILE_TXCOLS]; + $for c in range(TILE_TXCOLS): + $if SCALES_STORAGE == "buffer": + scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); + $else: + scales[${c}] = VEC4_T( + texelFetch(t_scales, ivec2(out_txcol + ${c}, 0), 0)); + + for (int pos = (4 * wid), txpos = wid; + pos < in_sizes.x; + pos += (4 * NWORKERS), txpos += NWORKERS) { + $if WEIGHT_STORAGE == "buffer": + uint 
qmat2_bufi; + uint weight_row_txstride = div4(weight_sizes.x); + + // Preload weight tensor + [[unroll]] for (int r = 0; r < 4; r++) { + $if QUANT_NBITS == 4: + $for c in range(0, TILE_TXCOLS, 2): + $if WEIGHT_STORAGE == "buffer": + qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; + const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] + $else: + const uvec4 packed_weight_tex = texelFetch( + t_weight, ivec2(weight_txcol + ${c}, pos + r), 0); + + qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); + qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); $else: - b[i] = VEC4_T(texelFetch(t_weight, ivec2(out_col >> 2, pos + i), 0)); + $for c in range(TILE_TXCOLS): + $if WEIGHT_STORAGE == "buffer": + qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; + qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; + $else: + qmat2[r][${c}] = VEC4_T( + texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0)); } - // Preload t_in - for (int i = 0; i < TILE_ROWS; i++) { + + $if IN_STORAGE == "buffer": + uint in_row_txstride = div4(in_sizes.x); + + // Preload input tensor + [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { $if IN_STORAGE == "buffer": - a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2]; + mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; $else: - a[i] = VEC4_T(texelFetch(t_in, ivec3(pos >> 2, out_row + i, 0), 0)); + mat1[i] = VEC4_T( + texelFetch(t_in, ivec3(txpos, out_row + i, 0), 0)); } // Accumulate partial output - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - local_c[i] += a[i].x * b[0] + - a[i].y * b[1] + - a[i].z * b[2] + - a[i].w * b[3]; + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + local_sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + + mat1[r].y * qmat2[1][${c}] + + mat1[r].z * qmat2[2][${c}] + + mat1[r].w * qmat2[3][${c}]; } } - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - partial_c[gid][wid][i] = local_c[i]; + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + partial_sums[gid][wid][r][${c}] = local_sums[r][${c}]; } memoryBarrierShared(); @@ -105,21 +152,33 @@ void main() { return; } - VEC4_T c[TILE_ROWS]; + VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; + + for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + sums[r][${c}] = VEC4_T(0.0); - for (int row = 0; row < TILE_ROWS; ++row) { - c[row] = VEC4_T(0.0); [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) { - c[row] += partial_c[gid][worker][row]; + $for c in range(TILE_TXCOLS): + sums[r][${c}] += partial_sums[gid][worker][r][${c}]; } } - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - $if OUT_STORAGE == "buffer": - if (out_row + i < out_sizes.y) { - t_out[((out_row + i) * out_sizes.x + out_col) >> 2] = c[i] * scales; - } - $else: - imageStore(t_out, ivec3(out_col >> 2, out_row + i, 0), c[i] * scales); + $if OUT_STORAGE == "buffer": + uint out_bufi; + uint out_row_txstride = div4(out_sizes.x); + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + $if OUT_STORAGE == "buffer": + if (out_row + r < out_sizes.y) { + out_bufi = (out_row + r) * out_row_txstride + out_txcol; + t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; + } + $else: + imageStore( + t_out, + ivec3(out_txcol + ${c}, out_row + r, 0), + sums[r][${c}] * scales[${c}]); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml index e0477a3a3d1..3dff6855142 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml @@ -12,6 +12,8 @@ linear_qcsnw_coop: WEIGHT_STORAGE: texture2d SCALES_STORAGE: texture2d TILE_ROWS: 4 + TILE_TXCOLS: 1 + QUANT_NBITS: 8 generate_variant_forall: TILE_ROWS: - VALUE: 1 @@ -26,3 +28,11 @@ linear_qcsnw_coop: OUT_STORAGE: buffer WEIGHT_STORAGE: buffer SCALES_STORAGE: buffer + - NAME: linear_qcs4w_coop_texture3d_texture3d_texture2d_texture2d_float + TILE_TXCOLS: 2 + QUANT_NBITS: 4 + - NAME: linear_qcs4w_coop_buffer_buffer_texture2d_texture2d_float + IN_STORAGE: buffer + OUT_STORAGE: buffer + TILE_TXCOLS: 2 + QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl index 3ef952ea34d..f6f05aab7ca 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl @@ -14,6 +14,7 @@ #define VEC4_T ${buffer_gvec_type(DTYPE, 4)} #define TILE_ROWS ${TILE_ROWS} +#define TILE_TXCOLS ${TILE_TXCOLS} ${define_required_extensions(DTYPE)} @@ -26,7 +27,10 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} +$if QUANT_NBITS == 4: + ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +$else: + ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} @@ -43,57 +47,110 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require void main() { - const uint16_t out_width_ntexels = uint16_t(divup4(out_sizes.x)); - const uint16_t out_col = uint16_t((gl_GlobalInvocationID.x % out_width_ntexels) << 2); - const uint16_t out_row = uint16_t((gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS); + // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. 
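  // Worked example of the txcol mapping computed below (illustrative values,
  // not from the patch): with out_sizes.x = 64 scalar columns and
  // TILE_TXCOLS = 2, each invocation covers 4 * 2 = 8 columns, so
  // global_wg_x = divup(64, 8) = 8; invocation 13 then gets
  // out_txcol = (13 % 8) * 2 = 10 (scalar column 40) and
  // out_row = (13 / 8) * TILE_ROWS.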
+ $if TILE_TXCOLS > 1: + const uint16_t global_wg_x = uint16_t(divup(out_sizes.x, 4 * TILE_TXCOLS)); + const uint16_t out_txcol = uint16_t( + (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); + $else: + const uint16_t global_wg_x = uint16_t(divup4(out_sizes.x)); + const uint16_t out_txcol = uint16_t(gl_GlobalInvocationID.x % global_wg_x); + + const uint16_t out_row = uint16_t( + (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); + + $if QUANT_NBITS == 4: + const uint16_t weight_txcol = uint16_t(out_txcol / 2); if (out_row >= uint16_t(out_sizes.y)) { return; } - VEC4_T a[TILE_ROWS]; - VEC4_T b[4]; - VEC4_T c[TILE_ROWS]; + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][TILE_TXCOLS]; + VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; - $if SCALES_STORAGE == "buffer": - const VEC4_T scales = VEC4_T(t_scales[int(out_col >> 2)]); - $else: - const VEC4_T scales = VEC4_T(texelFetch(t_scales, u16vec2(out_col >> 2, 0), 0)); + VEC4_T scales[TILE_TXCOLS]; + $for c in range(TILE_TXCOLS): + $if SCALES_STORAGE == "buffer": + scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); + $else: + scales[${c}] = VEC4_T( + texelFetch(t_scales, u16vec2(out_txcol + ${c}, 0), 0)); - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - c[i] = VEC4_T(0.0); + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + sums[r][${c}] = VEC4_T(0.0); } - for (uint16_t pos = uint16_t(0); pos < uint16_t(in_sizes.x); pos += uint16_t(4)) { + for (uint16_t pos = uint16_t(0), txpos = uint16_t(0); + pos < uint16_t(in_sizes.x); + pos += uint16_t(4), txpos += uint16_t(1)) { + $if WEIGHT_STORAGE == "buffer": + uint qmat2_bufi; + uint weight_row_txstride = div4(weight_sizes.x); + // Preload weight tensor - [[unroll]] for (int i = 0; i < 4; i++) { - $if WEIGHT_STORAGE == "buffer": - b[i] = t_weight[((pos + i) * out_sizes.x + out_col) >> 2]; + [[unroll]] for (int r = 0; r < 4; r++) { + $if QUANT_NBITS == 4: + $for c in range(0, TILE_TXCOLS, 2): + $if WEIGHT_STORAGE == "buffer": + qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; + const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] + $else: + const uvec4 packed_weight_tex = texelFetch( + t_weight, u16vec2(weight_txcol + ${c}, pos + r), 0); + + qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); + qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); $else: - b[i] = VEC4_T(texelFetch(t_weight, u16vec2(out_col >> 2, pos + i), 0)); + $for c in range(TILE_TXCOLS): + $if WEIGHT_STORAGE == "buffer": + qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; + qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; + $else: + qmat2[r][${c}] = VEC4_T( + texelFetch(t_weight, u16vec2(out_txcol + ${c}, pos + r), 0)); } + $if IN_STORAGE == "buffer": + uint in_row_txstride = div4(in_sizes.x); + // Preload input tensor [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { $if IN_STORAGE == "buffer": - a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2]; + mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; $else: - a[i] = VEC4_T(texelFetch(t_in, u16vec3(pos >> 2, out_row + i, 0), 0)); + mat1[i] = VEC4_T( + texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0)); } // Accumulate output - [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - c[i] += a[i].x * b[0] + a[i].y * b[1] + a[i].z * b[2] + a[i].w * b[3]; + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + + mat1[r].y * qmat2[1][${c}] + + mat1[r].z * qmat2[2][${c}] + + mat1[r].w * qmat2[3][${c}]; } } // Store to output tensor - 
[[unroll]] for (int i = 0; i < TILE_ROWS; ++i) { - $if OUT_STORAGE == "buffer": - if (out_row + i < out_sizes.y) { - t_out[((out_row + i) * out_sizes.x + out_col) >> 2] = c[i] * scales; - } - $else: - imageStore(t_out, ivec3(out_col >> 2, out_row + i, 0), c[i] * scales); + $if OUT_STORAGE == "buffer": + uint out_bufi; + uint out_row_txstride = div4(out_sizes.x); + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $for c in range(TILE_TXCOLS): + $if OUT_STORAGE == "buffer": + if (out_row + r < out_sizes.y) { + out_bufi = (out_row + r) * out_row_txstride + out_txcol; + t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; + } + $else: + imageStore( + t_out, + ivec3(out_txcol + ${c}, out_row + r, 0), + sums[r][${c}] * scales[${c}]); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml index f9f0134d995..1c9ec4e524a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml @@ -12,6 +12,8 @@ linear_qcsnw_tiled: WEIGHT_STORAGE: texture2d SCALES_STORAGE: texture2d TILE_ROWS: 4 + TILE_TXCOLS: 1 + QUANT_NBITS: 8 generate_variant_forall: TILE_ROWS: - VALUE: 1 @@ -30,3 +32,11 @@ linear_qcsnw_tiled: OUT_STORAGE: buffer WEIGHT_STORAGE: buffer SCALES_STORAGE: buffer + - NAME: linear_qcs4w_tiled_texture3d_texture3d_texture2d_texture2d_float + TILE_TXCOLS: 2 + QUANT_NBITS: 4 + - NAME: linear_qcs4w_tiled_buffer_buffer_texture2d_texture2d_float + IN_STORAGE: buffer + OUT_STORAGE: buffer + TILE_TXCOLS: 2 + QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp index 85695488dfc..6e101195e3f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp @@ -17,6 +17,7 @@ namespace vkcompute { void check_linear_qcsnw_args( const ComputeGraph& graph, + const int quant_nbits, const ValueRef mat1, const ValueRef qmat2_data, const ValueRef scales, @@ -31,13 +32,20 @@ void check_linear_qcsnw_args( VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - VK_CHECK_COND( - utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); - VK_CHECK_COND( - utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); + if (quant_nbits == 4) { + VK_CHECK_COND( + utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes) * 2); + VK_CHECK_COND( + utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); + } else { + VK_CHECK_COND( + utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); + VK_CHECK_COND( + utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); + } } -void resize_linear_qcs8w_node( +void resize_linear_qcsnw_node( ComputeGraph* graph, const std::vector& args, const std::vector& extra_args) { @@ -48,7 +56,12 @@ void resize_linear_qcs8w_node( vTensorPtr qmat2 = graph->get_tensor(args[1].refs[1]); const int out_cols = utils::val_at(-2, mat1->sizes()); - const int out_rows = utils::val_at(-1, qmat2->sizes()); + int out_rows = utils::val_at(-1, qmat2->sizes()); + // Byte dtype suggests 4-bit quantization in which case the weight tensor is + // packed with 2 values per byte. 
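  // Illustrative sizing example (values assumed, not from the patch): a 4-bit
  // weight prepacked to shape [K, N/2] = [128, 32] reports 32 in its last
  // dim, so the doubling below recovers the true N = 64 output features.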
+ if (qmat2->dtype() == vkapi::kByte) { + out_rows *= 2; + } std::vector new_out_sizes(3); if (mat1->sizes().size() == 2) { @@ -135,34 +148,40 @@ void add_linear_qcs8w_node( // Resize Args {}, // Resizing Logic - resize_linear_qcs8w_node)); + resize_linear_qcsnw_node)); if (!graph.is_buffer_storage(out) && graph.packed_dim_of(out) != WHCN::kWidthDim) { viewFn(graph, {out_W_packed, graph.add_none(), out}); } } -void add_linear_qcs8w_tiled_node( +void add_linear_qcsnw_tiled_node( ComputeGraph& graph, const bool use_coop_algorithm, + const int quant_nbits, const ValueRef mat1, const ValueRef q_mat2_data, const ValueRef scales_data, const ValueRef out) { - utils::StorageType q_mat2_storage = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); std::vector qmat2_orig_sizes = graph.sizes_of(q_mat2_data); const int64_t ndim = graph.dim_of(q_mat2_data); const int64_t K = qmat2_orig_sizes.at(ndim - 1); const int64_t N = qmat2_orig_sizes.at(ndim - 2); - if (N > max_extent * 4 || K > max_extent) { - q_mat2_storage = utils::kBuffer; - } + ValueRef q_mat2; + if (quant_nbits == 4) { + q_mat2 = + prepack_int4_linear_weight_transposed_interleaved(graph, q_mat2_data); + } else { + utils::StorageType q_mat2_storage = utils::kTexture2D; + if (N > max_extent * 4 || K > max_extent) { + q_mat2_storage = utils::kBuffer; + } - ValueRef q_mat2 = prepack_standard_hw_transposed( - graph, q_mat2_data, q_mat2_storage, utils::kWidthPacked); + q_mat2 = prepack_standard_hw_transposed( + graph, q_mat2_data, q_mat2_storage, utils::kWidthPacked); + } utils::StorageType scales_storage = utils::kTexture2D; if (N > max_extent) { @@ -171,8 +190,14 @@ void add_linear_qcs8w_tiled_node( ValueRef scales = prepack_standard(graph, scales_data, scales_storage, utils::kWidthPacked); - std::string kernel_name = - use_coop_algorithm ? "linear_qcs8w_coop" : "linear_qcs8w_tiled"; + std::string kernel_name; + if (quant_nbits == 4) { + kernel_name = + use_coop_algorithm ? "linear_qcs4w_coop" : "linear_qcs4w_tiled"; + } else { + kernel_name = + use_coop_algorithm ? 
"linear_qcs8w_coop" : "linear_qcs8w_tiled"; + } kernel_name.reserve(kShaderNameReserve); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1)); @@ -197,9 +222,16 @@ void add_linear_qcs8w_tiled_node( out_tile_nrows = 4; } + // Number of output texels in the output tile + uint32_t out_tile_ntxcols = 1; + if (quant_nbits == 4) { + out_tile_ntxcols = 2; + } + utils::uvec3 out_limits = graph.logical_limits_of(out); + uint32_t global_wg_x = utils::div_up(out_limits[0], out_tile_ntxcols); utils::uvec3 global_wg_size = { - out_limits[0] * (utils::div_up(out_limits[1], out_tile_nrows)), + global_wg_x * (utils::div_up(out_limits[1], out_tile_nrows)), 1, out_limits[2]}; @@ -224,7 +256,7 @@ void add_linear_qcs8w_tiled_node( // Resize Args {}, // Resizing Logic - resize_linear_qcs8w_node)); + resize_linear_qcsnw_node)); } bool can_use_tiled_impl( @@ -238,7 +270,7 @@ bool can_use_tiled_impl( // Check if mat1 is not a 3D tensor or that batches = 1 // TODO(ssjia): Add support for batches in the tiled impl - if (graph.dim_of(mat1) == 3 && graph.size_at(-1, mat1) != 1) { + if (graph.dim_of(mat1) == 3 && graph.size_at(0, mat1) != 1) { return false; } // Check that K is a multiple of 4 @@ -283,17 +315,27 @@ bool can_use_coop_impl(ComputeGraph& graph, const ValueRef mat1) { void weight_int8pack_mm( ComputeGraph& graph, const std::vector& args) { - check_linear_qcsnw_args(graph, args[0], args[1], args[2], args[3]); + check_linear_qcsnw_args(graph, 8, args[0], args[1], args[2], args[3]); if (can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])) { bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); - return add_linear_qcs8w_tiled_node( - graph, use_coop_algorithm, args[0], args[1], args[2], args[3]); + return add_linear_qcsnw_tiled_node( + graph, use_coop_algorithm, 8, args[0], args[1], args[2], args[3]); } return add_linear_qcs8w_node(graph, args[0], args[1], args[2], args[3]); } +void linear_qcs4w(ComputeGraph& graph, const std::vector& args) { + check_linear_qcsnw_args(graph, 4, args[0], args[1], args[2], args[3]); + + VK_CHECK_COND(can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])); + bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); + return add_linear_qcsnw_tiled_node( + graph, use_coop_algorithm, 4, args[0], args[1], args[2], args[3]); +} + REGISTER_OPERATORS { VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); + VK_REGISTER_OP(et_vk.linear_qcs4w.default, linear_qcs4w); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp index b3ead94d8ff..8c5cb0093d9 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp @@ -70,54 +70,6 @@ void resize_linear_qga4w_node( out->virtual_resize(new_out_sizes); } -ValueRef prepack_int4_linear_weight_transposed_interleaved( - ComputeGraph& graph, - const ValueRef qmat2_data) { - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - const int64_t N_div2 = N / int64_t(2); - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (N_div2 > max_extent * 4 || K > max_extent) { - storage_type = 
utils::kBuffer; - } - - std::vector qmat2_sizes{K, N_div2}; - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); - - utils::uvec3 global_wg_size; - global_wg_size = graph.logical_limits_of(qmat2); - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); - - std::string kernel_name = - graph.context()->adapter_ptr()->has_full_int8_buffers_support() - ? "pack_int4_linear_weight_transposed_interleaved" - : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2)})); - - return qmat2; -} - void add_linear_qga4w_node( ComputeGraph& graph, const ValueRef mat1, diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 32e63baeafc..f39b0fc33ff 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -246,6 +246,54 @@ ValueRef prepack_direct_copy_buffer( return tensor; } +ValueRef prepack_int4_linear_weight_transposed_interleaved( + ComputeGraph& graph, + const ValueRef qmat2_data) { + std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); + const int64_t ndim = graph.dim_of(qmat2_data); + + const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; + const int64_t N = qmat2_orig_sizes.at(ndim - 2); + const int64_t N_div2 = N / int64_t(2); + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (N_div2 > max_extent * 4 || K > max_extent) { + storage_type = utils::kBuffer; + } + + std::vector qmat2_sizes{K, N_div2}; + ValueRef qmat2 = graph.add_tensor( + qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); + + utils::uvec3 global_wg_size; + global_wg_size = graph.logical_limits_of(qmat2); + global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); + + std::string kernel_name = + graph.context()->adapter_ptr()->has_full_int8_buffers_support() + ? 
"pack_int4_linear_weight_transposed_interleaved" + : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + qmat2_data, + qmat2, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(qmat2)})); + + return qmat2; +} + void prepack_op(ComputeGraph& graph, const std::vector& args) { return add_prepack_standard_node(graph, args[0], args[1]); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index 1b6f245bd34..090a3718295 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -87,4 +87,11 @@ ValueRef prepack_direct_copy_buffer( ComputeGraph& graph, const ValueRef tensor_data); +// +// Op specific prepack functions + +ValueRef prepack_int4_linear_weight_transposed_interleaved( + ComputeGraph& graph, + const ValueRef qmat2_data); + } // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp index 5d08ee57859..b95b7b3aa6d 100644 --- a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp +++ b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp @@ -62,7 +62,7 @@ at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) { return weights_unpacked; } -at::Tensor dequantize_and_linear( +at::Tensor dequantize_and_linear_qga4w( const at::Tensor& x, const at::Tensor& weights_4x2, const int64_t groupsize, @@ -97,6 +97,56 @@ at::Tensor dequantize_and_linear( return at::linear(x, weights_dequantized); } +at::Tensor dequantize_and_linear_qcs4w( + const at::Tensor& x, + const at::Tensor& weights_4x2, + const at::Tensor& scales) { + std::vector weights_shape(weights_4x2.sizes().vec()); + weights_shape[1] *= 2; + + at::Tensor weights_dequantized = + at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); + + const int64_t N = weights_dequantized.size(0); + const int64_t K = weights_dequantized.size(1); + + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k += 2) { + // const int scale_idx = k_groups * n + group_idx; + const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); + const uint8_t second_val = packed_val & 0x0F; + const uint8_t first_val = (packed_val & 0xF0) >> 4; + + const float scale = scales[n].item().to(); + + weights_dequantized[n][k] = (float(first_val) - 8.0) * scale; + weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale; + } + } + + return at::linear(x, weights_dequantized); +} + +at::Tensor linear_qcs4w_reference_impl( + const at::Tensor& x, + const at::Tensor& weights_4x2, + const at::Tensor& scales) { + const std::vector original_x_size(x.sizes().vec()); + const size_t ndim = original_x_size.size(); + const int64_t out_features = weights_4x2.size(0); + const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); + + const at::Tensor weights_unpacked = + (unpack_weights_4x2(weights_4x2) - 8).to(at::kChar); + at::Tensor out = + at::_weight_int8pack_mm(x_flattened, weights_unpacked, scales); + + std::vector out_shape( + original_x_size.begin(), original_x_size.end()); + out_shape.at(ndim - 1) = out_features; + return out.reshape(out_shape); +} + // // Test functions // @@ -126,12 +176,31 @@ void test_reference_linear_qga4w( 
scales_and_zeros, inner_k_tiles); - at::Tensor out_ref = dequantize_and_linear( + at::Tensor out_ref = dequantize_and_linear_qga4w( x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); ASSERT_TRUE(at::allclose(out, out_ref)); } +void test_reference_linear_qcs4w( + const int B, + const int M, + const int K, + const int N) { + at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); + at::Tensor weights_4x2 = + at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); + at::Tensor weights_int = unpack_weights_4x2(weights_4x2); + + at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); + + at::Tensor out = linear_qcs4w_reference_impl(x, weights_4x2, scales); + + at::Tensor out_ref = dequantize_and_linear_qcs4w(x, weights_4x2, scales); + + ASSERT_TRUE(at::allclose(out, out_ref)); +} + vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { using namespace vkcompute; switch (at_scalartype) { @@ -265,6 +334,85 @@ void test_vulkan_linear_qga4w( vkcompute::utils::kTexture3D); } +void test_vulkan_linear_qcs4w_impl( + const int B, + const int M, + const int K, + const int N, + const vkcompute::utils::StorageType in_storage = + vkcompute::utils::kTexture3D, + const vkcompute::utils::StorageType out_storage = + vkcompute::utils::kTexture3D) { + at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); + at::Tensor weights_4x2 = + at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); + + at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); + + at::Tensor out_ref = linear_qcs4w_reference_impl(x, weights_4x2, scales); + + // Build Vulkan graph + using namespace vkcompute; + + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + +#define MAKE_TENSORREF_FOR(x) \ + ValueRef r_##x = graph.add_tensorref( \ + x.sizes().vec(), \ + from_at_scalartype(x.scalar_type()), \ + x.const_data_ptr()); + + MAKE_TENSORREF_FOR(weights_4x2); + MAKE_TENSORREF_FOR(scales); + + IOValueRef r_x = graph.add_input_tensor( + x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); + + const ValueRef r_out = graph.add_tensor( + out_ref.sizes().vec(), + from_at_scalartype(out_ref.scalar_type()), + out_storage); + + VK_GET_OP_FN("et_vk.linear_qcs4w.default") + (graph, {r_x.value, r_weights_4x2, r_scales, r_out}); + + ValueRef staging_out = graph.set_output_tensor(r_out); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + // + // Run model + // + + graph.propagate_resize(); + graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); + + graph.execute(); + + at::Tensor vk_out = at::empty_like(out_ref); + graph.copy_from_staging( + staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); + + ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); +} + +void test_vulkan_linear_qcs4w( + const int B, + const int M, + const int K, + const int N) { + test_vulkan_linear_qcs4w_impl( + B, M, K, N, vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); + + test_vulkan_linear_qcs4w_impl( + B, M, K, N, vkcompute::utils::kTexture3D, vkcompute::utils::kTexture3D); +} + TEST(VulkanLinearQGA4WTest, test_reference_impl) { test_reference_linear_qga4w( /*B = */ 1, @@ -294,3 +442,33 @@ TEST(VulkanLinearQGA4WTest, test_vulkan_impl_gemm) { /*K = */ 256, /*N = */ 256); } + +TEST(VulkanLinearQCS4WTest, test_reference_impl) { + test_reference_linear_qcs4w( + /*B = */ 1, + /*M = */ 4, + /*K = */ 128, + 
/*N = */ 32);
+}
+
+TEST(VulkanLinearQCS4WTest, test_vulkan_impl_small_m) {
+  test_vulkan_linear_qcs4w(
+      /*B = */ 1,
+      /*M = */ 4,
+      /*K = */ 128,
+      /*N = */ 32);
+
+  test_vulkan_linear_qcs4w(
+      /*B = */ 1,
+      /*M = */ 1,
+      /*K = */ 256,
+      /*N = */ 256);
+}
+
+TEST(VulkanLinearQCS4WTest, test_vulkan_impl_gemm) {
+  test_vulkan_linear_qcs4w(
+      /*B = */ 1,
+      /*M = */ 32,
+      /*K = */ 32,
+      /*N = */ 32);
+}

From d9c6f80546af09684e85277d7cee5aa7c15b2746 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?=
Date: Thu, 8 May 2025 14:14:10 +0200
Subject: [PATCH 007/178] Arm backend: Add model name to -llama_inputs (#10775)

This way, Llama variants other than stories110m can be run.
---
 backends/arm/test/models/test_llama.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py
index 44a8fdc2a04..f5d879b3b8b 100644
--- a/backends/arm/test/models/test_llama.py
+++ b/backends/arm/test/models/test_llama.py
@@ -33,27 +33,35 @@ class TestLlama(unittest.TestCase):
     """
     Test class of Llama models. Type of Llama model depends on command line parameters:
-    --llama_inputs <path to .pt file> <path to .json file>
-    Example: --llama_inputs stories110M/stories110M.pt stories110M/params.json
+    --llama_inputs <path to .pt file> <path to .json file> <model name>
+    Example: --llama_inputs stories110M/stories110M.pt stories110M/params.json stories110m
+
+    For more examples and info see examples/models/llama/README.md.
     """
 
     def prepare_model(self):
         checkpoint = None
         params_file = None
+        usage = "To run use --llama_inputs <.pt/.pth> <.json> <model name>"
+
         if conftest.is_option_enabled("llama_inputs"):
             param_list = conftest.get_option("llama_inputs")
-            assert (
-                isinstance(param_list, list) and len(param_list) == 2
-            ), "invalid number of inputs for --llama_inputs"
+
+            if not isinstance(param_list, list) or len(param_list) != 3:
+                raise RuntimeError(
+                    f"Invalid number of inputs for --llama_inputs. {usage}"
+                )
+            if not all(isinstance(param, str) for param in param_list):
+                raise RuntimeError(
+                    f"All --llama_inputs are expected to be strings. {usage}"
+                )
+
             checkpoint = param_list[0]
             params_file = param_list[1]
-            assert isinstance(checkpoint, str) and isinstance(
-                params_file, str
-            ), "invalid input for --llama_inputs"
+            model_name = param_list[2]
         else:
             logger.warning(
-                "Skipping Llama test because of lack of input. To run use --llama_inputs <.pt> <.json>"
+                f"Skipping Llama tests because of missing --llama_inputs. 
{usage}" ) return None, None, None @@ -71,7 +79,7 @@ def prepare_model(self): "-p", params_file, "--model", - "stories110m", + model_name, ] parser = build_args_parser() args = parser.parse_args(args) @@ -122,6 +130,7 @@ def test_llama_tosa_BI(self): .quantize() .export() .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .run_method_and_compare_outputs( inputs=llama_inputs, From 3c21e3a9d8d79c6091647b70b27ac2487940cccc Mon Sep 17 00:00:00 2001 From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com> Date: Thu, 8 May 2025 15:53:08 +0100 Subject: [PATCH 008/178] Arm Backend: Update unit tests for TOSA 1.0 (#10776) ### Summary Refactoring of unit tests to allow for testing of TOSA 1.0 Adds command-line argument --arm_run_tosa_version to run tests on particular version --- backends/arm/scripts/parse_test_names.py | 10 +- backends/arm/test/common.py | 14 +- backends/arm/test/conftest.py | 16 +- backends/arm/test/ops/test_abs.py | 173 ++---- backends/arm/test/ops/test_add.py | 85 +-- backends/arm/test/ops/test_alias_copy.py | 24 +- backends/arm/test/ops/test_amax.py | 95 ++- backends/arm/test/ops/test_amin.py | 86 ++- backends/arm/test/ops/test_any.py | 88 +-- backends/arm/test/ops/test_arange.py | 39 +- backends/arm/test/ops/test_avg_pool2d.py | 95 +-- backends/arm/test/ops/test_batch_norm.py | 500 +++++---------- backends/arm/test/ops/test_bitwise.py | 56 +- backends/arm/test/ops/test_bmm.py | 307 +++++----- backends/arm/test/ops/test_cat.py | 274 ++++----- backends/arm/test/ops/test_clamp.py | 67 +-- backends/arm/test/ops/test_clone.py | 63 +- backends/arm/test/ops/test_constant_pad_nd.py | 25 +- backends/arm/test/ops/test_conv1d.py | 72 +-- backends/arm/test/ops/test_conv2d.py | 99 ++- backends/arm/test/ops/test_conv3d.py | 71 ++- backends/arm/test/ops/test_conv_combos.py | 567 ++++++++++-------- backends/arm/test/ops/test_depthwise_conv.py | 266 ++++---- backends/arm/test/ops/test_div.py | 268 +++------ backends/arm/test/ops/test_eq.py | 66 +- backends/arm/test/ops/test_erf.py | 20 +- backends/arm/test/ops/test_exp.py | 188 +++--- backends/arm/test/ops/test_expand.py | 263 ++++---- backends/arm/test/ops/test_full.py | 369 ++++++------ backends/arm/test/ops/test_ge.py | 54 +- backends/arm/test/ops/test_gelu.py | 42 +- backends/arm/test/ops/test_gt.py | 58 +- backends/arm/test/ops/test_hardsigmoid.py | 191 +++--- backends/arm/test/ops/test_hardswish.py | 181 ++---- backends/arm/test/ops/test_hardtanh.py | 206 +++---- backends/arm/test/ops/test_layer_norm.py | 35 +- backends/arm/test/ops/test_le.py | 60 +- backends/arm/test/ops/test_leaky_relu.py | 28 +- backends/arm/test/ops/test_linear.py | 326 ++++------ backends/arm/test/ops/test_log.py | 178 ++---- backends/arm/test/ops/test_logical.py | 86 ++- backends/arm/test/ops/test_logsoftmax.py | 41 +- backends/arm/test/ops/test_lshift.py | 91 +-- backends/arm/test/ops/test_lt.py | 62 +- backends/arm/test/ops/test_max_pool.py | 421 +++++-------- backends/arm/test/ops/test_maximum.py | 182 ++---- backends/arm/test/ops/test_mean_dim.py | 80 +-- backends/arm/test/ops/test_minimum.py | 185 ++---- backends/arm/test/ops/test_mm.py | 71 +-- backends/arm/test/ops/test_mul.py | 273 ++++----- backends/arm/test/ops/test_ne.py | 6 +- backends/arm/test/ops/test_permute.py | 260 +++----- backends/arm/test/ops/test_pow.py | 28 +- backends/arm/test/ops/test_reciprocal.py | 197 +++--- backends/arm/test/ops/test_relu.py | 197 +++--- backends/arm/test/ops/test_repeat.py | 183 +++--- 
backends/arm/test/ops/test_rshift.py | 113 ++-- backends/arm/test/ops/test_rsqrt.py | 170 +++--- backends/arm/test/ops/test_scalar_tensor.py | 42 +- backends/arm/test/ops/test_scalars.py | 178 +++--- backends/arm/test/ops/test_select.py | 313 +++++----- backends/arm/test/ops/test_sigmoid.py | 327 +++++----- backends/arm/test/ops/test_sigmoid_16bit.py | 68 ++- backends/arm/test/ops/test_sigmoid_32bit.py | 59 +- backends/arm/test/ops/test_silu.py | 32 +- backends/arm/test/ops/test_slice.py | 212 +++---- backends/arm/test/ops/test_softmax.py | 25 +- backends/arm/test/ops/test_split.py | 262 ++++---- backends/arm/test/ops/test_sqrt.py | 32 +- backends/arm/test/ops/test_squeeze.py | 389 ++++++------ backends/arm/test/ops/test_sub.py | 115 +--- backends/arm/test/ops/test_sum.py | 247 +++----- backends/arm/test/ops/test_tanh.py | 203 +++---- backends/arm/test/ops/test_to_copy.py | 75 ++- backends/arm/test/ops/test_unary.py | 77 ++- backends/arm/test/ops/test_unsqueeze.py | 156 ++--- .../arm/test/ops/test_upsample_nearest2d.py | 262 ++++---- backends/arm/test/ops/test_var.py | 546 +++++++++-------- backends/arm/test/ops/test_view.py | 206 +++---- backends/arm/test/ops/test_where.py | 113 ++-- backends/arm/test/tester/test_pipeline.py | 43 +- 81 files changed, 5412 insertions(+), 6841 deletions(-) diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py index 8aabf7c2c59..46cf3e17a73 100644 --- a/backends/arm/scripts/parse_test_names.py +++ b/backends/arm/scripts/parse_test_names.py @@ -5,7 +5,15 @@ from executorch.exir.dialects.edge.spec.utils import SAMPLE_INPUT # Add edge ops which we lower but which are not included in exir/dialects/edge/edge.yaml here. -CUSTOM_EDGE_OPS = ["linspace.default", "eye.default"] +CUSTOM_EDGE_OPS = [ + "linspace.default", + "eye.default", + "hardsigmoid.default", + "hardswish.default", + "linear.default", + "maximum.default", + "adaptive_avg_pool2d.default", +] ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS # Add all targets and TOSA profiles we support here. diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 57606e51f47..3f90c8c056c 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -259,17 +259,15 @@ def decorator_func(func): raise RuntimeError( "xfail info needs to be str, or tuple[str, type[Exception]]" ) - pytest_param = pytest.param( - test_parameters, - id=id, - marks=pytest.mark.xfail( - reason=reason, raises=raises, strict=strict - ), + # Set up our fail marker + marker = ( + pytest.mark.xfail(reason=reason, raises=raises, strict=strict), ) else: - pytest_param = pytest.param(test_parameters, id=id) - pytest_testsuite.append(pytest_param) + marker = () + pytest_param = pytest.param(test_parameters, id=id, marks=marker) + pytest_testsuite.append(pytest_param) return pytest.mark.parametrize(arg_name, pytest_testsuite)(func) return decorator_func diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index db097e9d7d9..2d247f7bd42 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -12,12 +12,6 @@ import pytest -try: - import tosa_tools.v0_80.tosa_reference_model as tosa_reference_model -except ImportError: - logging.warning("tosa_reference_model not found, can't run reference model tests") - tosa_reference_model = None - """ This file contains the pytest hooks, fixtures etc. for the Arm test suite. 
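 (Illustrative invocation of the option introduced in this patch; the test
  path is assumed: pytest backends/arm/test --arm_run_tosa_version 1.0)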
""" @@ -50,10 +44,11 @@ def pytest_configure(config): if getattr(config.option, "fast_fvp", False): pytest._test_options["fast_fvp"] = config.option.fast_fvp # type: ignore[attr-defined] - # TODO: remove this flag once we have a way to run the reference model tests with Buck - pytest._test_options["tosa_ref_model"] = False # type: ignore[attr-defined] - if tosa_reference_model is not None: - pytest._test_options["tosa_ref_model"] = True # type: ignore[attr-defined] + if config.option.arm_run_tosa_version: + pytest._test_options["tosa_version"] = config.option.arm_run_tosa_version + + pytest._test_options["tosa_ref_model"] = True # type: ignore[attr-defined] + logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -76,6 +71,7 @@ def try_addoption(*args, **kwargs): nargs="+", help="List of two files. Firstly .pt file. Secondly .json", ) + try_addoption("--arm_run_tosa_version", action="store", default="0.80") def pytest_sessionstart(session): diff --git a/backends/arm/test/ops/test_abs.py b/backends/arm/test/ops/test_abs.py index 481c7d5ed0d..ed7e616e946 100644 --- a/backends/arm/test/ops/test_abs.py +++ b/backends/arm/test/ops/test_abs.py @@ -1,125 +1,68 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestAbs(unittest.TestCase): - class Abs(torch.nn.Module): - test_parameters = [ - (torch.zeros(5),), - (torch.full((5,), -1, dtype=torch.float32),), - (torch.ones(5) * -1,), - (torch.randn(8),), - (torch.randn(2, 3, 4),), - (torch.randn(1, 2, 3, 4),), - (torch.normal(mean=0, std=10, size=(2, 3, 4)),), - ] - - def forward(self, x): - return torch.abs(x) - - def _test_abs_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.abs.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["torch.ops.aten.abs.default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_abs_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.abs.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_abs_ethosu_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - 
.check_count({"torch.ops.aten.abs.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(Abs.test_parameters) - def test_abs_tosa_MI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_abs_tosa_MI_pipeline(self.Abs(), test_data) - - @parameterized.expand(Abs.test_parameters) - def test_abs_tosa_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_abs_tosa_BI_pipeline(self.Abs(), test_data) - - @parameterized.expand(Abs.test_parameters) - @pytest.mark.corstone_fvp - def test_abs_u55_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_abs_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Abs(), test_data - ) - - @parameterized.expand(Abs.test_parameters) - @pytest.mark.corstone_fvp - def test_abs_u85_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_abs_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Abs(), test_data - ) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.abs.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_abs_default" + +input_t1 = Tuple[torch.Tensor] # Input x + + +class Abs(torch.nn.Module): + test_parameters = { + "zeros": lambda: (torch.zeros(5),), + "full": lambda: (torch.full((5,), -1, dtype=torch.float32),), + "ones": lambda: (torch.ones(5) * -1,), + "randn_1d": lambda: (torch.randn(8),), + "randn_3d": lambda: (torch.randn(2, 3, 4),), + "randn_4d": lambda: (torch.randn(1, 2, 3, 4),), + "torch_normal": lambda: (torch.normal(mean=0, std=10, size=(2, 3, 4)),), + } + + def forward(self, x): + return torch.abs(x) + + +@common.parametrize("test_data", Abs.test_parameters) +def test_abs_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1](Abs(), test_data(), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", Abs.test_parameters) +def test_abs_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1](Abs(), test_data(), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", Abs.test_parameters) +@common.XfailIfNoCorstone300 +def test_abs_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Abs.test_parameters) +@common.XfailIfNoCorstone320 +def test_abs_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 486e53c5f03..67833576886 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -10,18 +10,18 @@ import torch from executorch.backends.arm.arm_backend import get_tosa_spec from executorch.backends.arm.quantizer import arm_quantizer -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineBI, EthosU85PipelineBI, TosaPipelineBI, TosaPipelineMI, ) +from 
executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester import Quantize from torch.ao.quantization.observer import HistogramObserver from torch.ao.quantization.quantizer import QuantizationSpec - aten_op = "torch.ops.aten.add.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_add_Tensor" @@ -33,11 +33,11 @@ def forward(self, x: torch.Tensor): return x + x test_data: list[input_t1] = { - "5d_float": (torch.FloatTensor([1, 2, 3, 5, 7]),), - "1d_ones": ((3 * torch.ones(8),)), - "1d_randn": (10 * torch.randn(8),), - "4d_ones_1": (torch.ones(1, 1, 4, 4),), - "4d_ones_2": (torch.ones(1, 3, 4, 2),), + "5d_float": lambda: (torch.FloatTensor([1, 2, 3, 5, 7]),), + "1d_ones": lambda: ((3 * torch.ones(8),)), + "1d_randn": lambda: (10 * torch.randn(8),), + "4d_ones_1": lambda: (torch.ones(1, 1, 4, 4),), + "4d_ones_2": lambda: (torch.ones(1, 3, 4, 2),), } @@ -49,14 +49,17 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x + y test_data: list[input_t2] = { - "5d_float": ( + "5d_float": lambda: ( torch.FloatTensor([1, 2, 3, 5, 7]), (torch.FloatTensor([2, 1, 2, 1, 10])), ), - "4d_ones": (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), - "4d_randn_1": (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - "4d_randn_2": (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), - "4d_randn_big": (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), + "4d_ones": lambda: (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), + "4d_randn_1": lambda: (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), + "4d_randn_2": lambda: (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), + "4d_randn_big": lambda: ( + 10000 * torch.randn(1, 1, 4, 4), + torch.randn(1, 1, 4, 1), + ), } @@ -65,31 +68,35 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x + y test_data: list[input_t2] = { - "3d_randn_diff_rank": (torch.randn(1, 4, 5), torch.randn(4, 1)), - "4d_randn_diff_rank": (torch.randn(1, 1, 4, 4), torch.randn(4, 1)), - "4d_randn_diff_rank_2": (torch.randn(4, 1), torch.randn(1, 1, 4, 5)), + "3d_randn_diff_rank": lambda: (torch.randn(1, 4, 5), torch.randn(4, 1)), + "4d_randn_diff_rank": lambda: (torch.randn(1, 1, 4, 4), torch.randn(4, 1)), + "4d_randn_diff_rank_2": lambda: (torch.randn(4, 1), torch.randn(1, 1, 4, 5)), } @common.parametrize("test_data", Add.test_data) -def test_add_tosa_MI(test_data: input_t1): - pipeline = TosaPipelineMI[input_t1](Add(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1](Add(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add.test_data) -def test_add_tosa_BI(test_data: input_t1): - pipeline = TosaPipelineBI[input_t1](Add(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1](Add(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add.test_data) -def test_add_i32_tosa_BI(test_data: input_t1): - pipeline = TosaPipelineBI[input_t1](Add(), test_data, aten_op, exir_op) - +def test_add_tensor_tosa_BI_i32(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1](Add(), test_data(), aten_op, exir_op) + tosa_version = conftest.get_option("tosa_version") + tosa_profiles = { + "0.80": TosaSpecification.create_from_string("TOSA-0.80+BI"), + "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT"), + } # Create a quantizer with int8 quantization on the input and output but int32 on everything else. 
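    # Reading of this hunk (annotation, not part of the patch): tosa_profiles
    # maps the --arm_run_tosa_version string from conftest (default "0.80") to
    # a TosaSpecification, so the same int8-I/O, int32-internal quantizer
    # setup runs against either TOSA-0.80+BI or TOSA-1.0+INT.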
quantizer = arm_quantizer.TOSAQuantizer( - get_tosa_spec(common.get_tosa_compile_spec("TOSA-0.80+BI")) + get_tosa_spec(common.get_tosa_compile_spec(tosa_profiles[tosa_version])) ) quantizer.set_io(arm_quantizer.get_symmetric_quantization_config()) observer_options = {"eps": 2**-16} @@ -117,59 +124,59 @@ def test_add_i32_tosa_BI(test_data: input_t1): @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone300 -def test_add_u55_BI(test_data: input_t1): +def test_add_tensor_u55_BI(test_data: input_t1): pipeline = EthosU55PipelineBI[input_t1]( - Add(), test_data, aten_op, exir_op, run_on_fvp=True + Add(), test_data(), aten_op, exir_op, run_on_fvp=True ) pipeline.run() @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone320 -def test_add_u85_BI(test_data: input_t1): +def test_add_tensor_u85_BI(test_data: input_t1): pipeline = EthosU85PipelineBI[input_t1]( - Add(), test_data, aten_op, exir_op, run_on_fvp=True + Add(), test_data(), aten_op, exir_op, run_on_fvp=True ) pipeline.run() @common.parametrize("test_data", Add2.test_data) -def test_add_2_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Add2(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_MI_2(test_data: input_t2): + pipeline = TosaPipelineMI[input_t2](Add2(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add3.test_data) -def test_add3_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Add3(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_MI_3(test_data: input_t2): + pipeline = TosaPipelineMI[input_t2](Add3(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add3.test_data) -def test_add3_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Add3(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_BI_3(test_data: input_t2): + pipeline = TosaPipelineBI[input_t2](Add3(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add2.test_data) -def test_add_2_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Add2(), test_data, aten_op, exir_op) +def test_add_tensor_tosa_BI_2(test_data: input_t2): + pipeline = TosaPipelineBI[input_t2](Add2(), test_data(), aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add2.test_data) @common.XfailIfNoCorstone300 -def test_add_2_u55_BI(test_data: input_t2): +def test_add_tensor_u55_BI_2(test_data: input_t2): pipeline = EthosU55PipelineBI[input_t2]( - Add2(), test_data, aten_op, exir_op, run_on_fvp=True + Add2(), test_data(), aten_op, exir_op, run_on_fvp=True ) pipeline.run() @common.parametrize("test_data", Add2.test_data) @common.XfailIfNoCorstone320 -def test_add_2_u85_BI(test_data: input_t2): +def test_add_tensor_u85_BI_2(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Add2(), test_data, aten_op, exir_op, run_on_fvp=True + Add2(), test_data(), aten_op, exir_op, run_on_fvp=True ) pipeline.run() diff --git a/backends/arm/test/ops/test_alias_copy.py b/backends/arm/test/ops/test_alias_copy.py index 66fa92bc445..44787fed950 100644 --- a/backends/arm/test/ops/test_alias_copy.py +++ b/backends/arm/test/ops/test_alias_copy.py @@ -30,10 +30,10 @@ class AliasCopy(torch.nn.Module): exir_op = "executorch_exir_dialects_edge__ops_aten_alias_copy_default" test_data: dict[input_t1] = { - "1d_ramp": (torch.arange(-16, 16, 0.2),), - "2d_ones": (torch.ones(5, 5),), - "3d_rand": (torch.rand(3, 5, 5),), - "4d_zeros": (torch.zeros(1, 10, 10, 10),), + "1d_ramp": lambda: 
(torch.arange(-16, 16, 0.2),), + "2d_ones": lambda: (torch.ones(5, 5),), + "3d_rand": lambda: (torch.rand(3, 5, 5),), + "4d_zeros": lambda: (torch.zeros(1, 10, 10, 10),), } def __init__(self): @@ -44,40 +44,40 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", AliasCopy.test_data) -def test_alias_copy_tosa_MI(test_data: input_t1): +def test_alias_tosa_MI(test_data: input_t1): TosaPipelineMI[input_t1]( AliasCopy(), - test_data, + test_data(), AliasCopy.aten_op, AliasCopy.exir_op, ).run() @common.parametrize("test_data", AliasCopy.test_data) -def test_alias_copy_tosa_BI(test_data: input_t1): +def test_alias_tosa_BI(test_data: input_t1): TosaPipelineBI[input_t1]( AliasCopy(), - test_data, + test_data(), AliasCopy.aten_op, AliasCopy.exir_op, ).run() @common.parametrize("test_data", AliasCopy.test_data) -def test_alias_copy_u55_BI(test_data: input_t1): +def test_alias_u55_BI(test_data: input_t1): EthosU55PipelineBI[input_t1]( AliasCopy(), - test_data, + test_data(), AliasCopy.aten_op, AliasCopy.exir_op, ).run() @common.parametrize("test_data", AliasCopy.test_data) -def test_alias_copy_u85_BI(test_data: input_t1): +def test_alias_u85_BI(test_data: input_t1): EthosU85PipelineBI[input_t1]( AliasCopy(), - test_data, + test_data(), AliasCopy.aten_op, AliasCopy.exir_op, ).run() diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py index b2639a5f108..0d1f4257b7b 100644 --- a/backends/arm/test/ops/test_amax.py +++ b/backends/arm/test/ops/test_amax.py @@ -30,11 +30,11 @@ def forward(self, x): return torch.amax(x, self.dim, self.keep_dims) test_data: Dict[str, input_t] = { - "rank_1_dim_0": ((torch.rand([10]),), 0, False), - "rank_2_dim_1_keep_dims": ((torch.rand([2, 2]),), (1,), True), - "rank_4_all_dim": ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), - "rank_4_0,3_keep_dims": ((torch.rand([1, 2, 2, 2]),), (0, 3), True), - "rank_4_mult_batches": ((torch.rand([2, 2, 2, 2]),), (0), True), + "rank_1_dim_0": lambda: ((torch.rand([10]),), 0, False), + "rank_2_dim_1_keep_dims": lambda: ((torch.rand([2, 2]),), (1,), True), + "rank_4_all_dim": lambda: ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), + "rank_4_0,3_keep_dims": lambda: ((torch.rand([1, 2, 2, 2]),), (0, 3), True), + "rank_4_mult_batches": lambda: ((torch.rand([2, 2, 2, 2]),), (0), True), } @@ -51,10 +51,10 @@ def forward(self, x): return x[0] test_data: Dict[str, input_t] = { - "rank_1_dim_0": ((torch.rand([10]),), 0), - "rank_2_dim_1": ((torch.rand([2, 2]),), 1), - "rank_4_dim_2": ((torch.rand([2, 2, 2, 2]),), 2), - "rank_4_dim_3": ((torch.rand([2, 2, 2, 2]),), 3), + "rank_1_dim_0": lambda: ((torch.rand([10]),), 0), + "rank_2_dim_1": lambda: ((torch.rand([2, 2]),), 1), + "rank_4_dim_2": lambda: ((torch.rand([2, 2, 2, 2]),), 2), + "rank_4_dim_3": lambda: ((torch.rand([2, 2, 2, 2]),), 3), } @@ -70,44 +70,26 @@ def forward(self, x): @common.parametrize("test_data", Amax.test_data) def test_amax_tosa_MI(test_data: Amax.input_t): - data, dim, keep_dims = test_data - pipeline = TosaPipelineMI[Amax.input_t]( - Amax(dim, keep_dims), - data, - Amax.aten_op, - ) + data, dim, keep_dims = test_data() + pipeline = TosaPipelineMI[Amax.input_t](Amax(dim, keep_dims), data, Amax.aten_op) pipeline.run() @common.parametrize("test_data", Amax.test_data) def test_amax_tosa_BI(test_data: Amax.input_t): - data, dim, keep_dims = test_data - pipeline = TosaPipelineBI[Amax.input_t]( - Amax(dim, keep_dims), - data, - Amax.aten_op, - ) + data, dim, keep_dims = test_data() + pipeline = 
TosaPipelineBI[Amax.input_t](Amax(dim, keep_dims), data, Amax.aten_op) pipeline.run() def test_amax_u55_BI_not_delegated(): - data, dim, keep_dims = Amax.test_data["rank_4_all_dim"] + data, dim, keep_dims = Amax.test_data["rank_4_all_dim"]() pipeline = OpNotSupportedPipeline[Amax.input_t]( Amax(dim, keep_dims), data, - "TOSA-0.80+BI+u55", {" executorch_exir_dialects_edge__ops_aten_amax_default": 1}, - ) - pipeline.run() - - -@common.parametrize("test_data", Amax.test_data) -def test_amax_u85_BI(test_data: Amax.input_t): - data, dim, keep_dims = test_data - pipeline = EthosU85PipelineBI[Amax.input_t]( - Amax(dim, keep_dims), - data, - Amax.aten_op, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -116,50 +98,43 @@ def test_amax_u85_BI(test_data: Amax.input_t): @common.parametrize("test_data", Amax.test_data, fvp_xfails) -@common.SkipIfNoCorstone320 -def test_amax_u85_BI_on_fvp(test_data: Amax.input_t): - data, dim, keep_dims = test_data +@common.XfailIfNoCorstone320 +def test_amax_u85_BI(test_data: Amax.input_t): + data, dim, keep_dims = test_data() pipeline = EthosU85PipelineBI[Amax.input_t]( - Amax(dim, keep_dims), data, Amax.aten_op, run_on_fvp=True + Amax(dim, keep_dims), + data, + Amax.aten_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_data", Max.test_data) -def test_max_to_amax_MI(test_data: Max.input_t): - data, dim = test_data - pipeline = TosaPipelineMI[Max.input_t]( - Max(dim), - data, - "torch.ops.aten.max", - ) +def test_max_dim_tosa_MI_to_amax(test_data: Max.input_t): + data, dim = test_data() + pipeline = TosaPipelineMI[Max.input_t](Max(dim), data, "torch.ops.aten.max") pipeline.run() @common.parametrize("test_data", Max.test_data) -def test_max_to_amax_BI(test_data: Max.input_t): - data, dim = test_data +def test_max_dim_tosa_BI_to_amax(test_data: Max.input_t): + data, dim = test_data() module = Max(dim) - pipeline = TosaPipelineBI[Max.input_t]( - module, - data, - "torch.ops.aten.amax", - ) + pipeline = TosaPipelineBI[Max.input_t](module, data, "torch.ops.aten.amax") pipeline.run() @pytest.mark.xfail(reason="MLETORCH-718 : Quantization of indices in arm_quantizer") -def test_max_index_not_delegated_BI(): - data, dim = Max.test_data["rank_4_dim_3"] +def test_max_dim_tosa_BI_not_delegated(): + data, dim = Max.test_data()["rank_4_dim_3"]() pipeline = OpNotSupportedPipeline[Max.input_t]( - MaxWithIndex(dim), data, "TOSA-0.80+BI", {} + MaxWithIndex(dim), data, {}, quantize=True ) pipeline.run() -def test_max_index_not_delegated_MI(): - data, dim = Max.test_data["rank_4_dim_3"] - pipeline = OpNotSupportedPipeline[Max.input_t]( - MaxWithIndex(dim), data, "TOSA-0.80+MI", {} - ) +def test_max_dim_tosa_MI_not_delegated(): + data, dim = Max.test_data["rank_4_dim_3"]() + pipeline = OpNotSupportedPipeline[Max.input_t](MaxWithIndex(dim), data, {}) pipeline.run() diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py index 092ed472bce..d83a5ee8839 100644 --- a/backends/arm/test/ops/test_amin.py +++ b/backends/arm/test/ops/test_amin.py @@ -31,11 +31,11 @@ def forward(self, x): return torch.amin(x, self.dim, self.keep_dims) test_data: Dict[str, input_t] = { - "rank_1_dim_0": ((torch.rand([10]),), 0, False), - "rank_2_dim_1_keep_dims": ((torch.rand([2, 2]),), (1,), True), - "rank_4_all_dim": ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), - "rank_4_0,3_keep_dims": ((torch.rand([1, 2, 2, 2]),), (0, 3), True), - "rank_4_mult_batches": ((torch.rand([2, 2, 2, 2]),), (0), True), + "rank_1_dim_0": lambda: ((torch.rand([10]),), 0, False), 
+ "rank_2_dim_1_keep_dims": lambda: ((torch.rand([2, 2]),), (1,), True), + "rank_4_all_dim": lambda: ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), + "rank_4_0,3_keep_dims": lambda: ((torch.rand([1, 2, 2, 2]),), (0, 3), True), + "rank_4_mult_batches": lambda: ((torch.rand([2, 2, 2, 2]),), (0), True), } @@ -52,10 +52,10 @@ def forward(self, x): return x[0] test_data: Dict[str, input_t] = { - "rank_1_dim_0": ((torch.rand([10]),), 0), - "rank_2_dim_1": ((torch.rand([2, 2]),), 1), - "rank_4_dim_2": ((torch.rand([2, 2, 2, 2]),), 2), - "rank_4_dim_3": ((torch.rand([2, 2, 2, 2]),), 3), + "rank_1_dim_0": lambda: ((torch.rand([10]),), 0), + "rank_2_dim_1": lambda: ((torch.rand([2, 2]),), 1), + "rank_4_dim_2": lambda: ((torch.rand([2, 2, 2, 2]),), 2), + "rank_4_dim_3": lambda: ((torch.rand([2, 2, 2, 2]),), 3), } @@ -71,7 +71,7 @@ def forward(self, x): @common.parametrize("test_data", Amin.test_data) def test_amin_tosa_MI(test_data: Amin.input_t): - data, dim, keep_dims = test_data + data, dim, keep_dims = test_data() pipeline = TosaPipelineMI[Amin.input_t]( Amin(dim, keep_dims), data, @@ -82,7 +82,7 @@ def test_amin_tosa_MI(test_data: Amin.input_t): @common.parametrize("test_data", Amin.test_data) def test_amin_tosa_BI(test_data: Amin.input_t): - data, dim, keep_dims = test_data + data, dim, keep_dims = test_data() pipeline = TosaPipelineBI[Amin.input_t]( Amin(dim, keep_dims), data, @@ -92,23 +92,13 @@ def test_amin_tosa_BI(test_data: Amin.input_t): def test_amin_u55_BI_not_delegated(): - data, dim, keep_dims = Amin.test_data["rank_4_all_dim"] + data, dim, keep_dims = Amin.test_data["rank_4_all_dim"]() pipeline = OpNotSupportedPipeline[Amin.input_t]( Amin(dim, keep_dims), data, - "TOSA-0.80+BI+u55", {" executorch_exir_dialects_edge__ops_aten_amin_default": 1}, - ) - pipeline.run() - - -@common.parametrize("test_data", Amin.test_data) -def test_amin_u85_BI(test_data: Amin.input_t): - data, dim, keep_dims = test_data - pipeline = EthosU85PipelineBI[Amin.input_t]( - Amin(dim, keep_dims), - data, - Amin.aten_op, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -117,50 +107,46 @@ def test_amin_u85_BI(test_data: Amin.input_t): @common.parametrize("test_data", Amin.test_data, fvp_xfails) -@common.SkipIfNoCorstone320 -def test_amin_u85_BI_on_fvp(test_data: Amin.input_t): - data, dim, keep_dims = test_data +@common.XfailIfNoCorstone320 +def test_amin_u85_BI(test_data: Amin.input_t): + data, dim, keep_dims = test_data() pipeline = EthosU85PipelineBI[Amin.input_t]( - Amin(dim, keep_dims), data, Amin.aten_op, run_on_fvp=True + Amin(dim, keep_dims), + data, + Amin.aten_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_data", Min.test_data) -def test_min_to_amin_MI(test_data: Min.input_t): - data, dim = test_data - pipeline = TosaPipelineMI[Min.input_t]( - Min(dim), - data, - "torch.ops.aten.min", - ) +def test_min_dim_tosa_MI_to_amin(test_data: Min.input_t): + data, dim = test_data() + pipeline = TosaPipelineMI[Min.input_t](Min(dim), data, "torch.ops.aten.min") pipeline.run() @common.parametrize("test_data", Min.test_data) -def test_min_to_amin_BI(test_data: Min.input_t): - data, dim = test_data +def test_min_dim_tosa_BI_to_amin(test_data: Min.input_t): + data, dim = test_data() module = Min(dim) - pipeline = TosaPipelineBI[Min.input_t]( - module, - data, - "torch.ops.aten.amin", - ) + pipeline = TosaPipelineBI[Min.input_t](module, data, "torch.ops.aten.amin") pipeline.run() @pytest.mark.xfail(reason="MLETORCH-718 : Quantization of indices in arm_quantizer") -def 
test_max_index_not_delegated_BI(): - data, dim = Min.test_data["rank_4_dim_3"] +def test_min_dim_tosa_BI_not_delegated(): + data, dim = Min.test_data["rank_4_dim_3"]() pipeline = OpNotSupportedPipeline[Min.input_t]( - MinWithIndex(dim), data, "TOSA-0.80+BI", {} + MinWithIndex(dim), + data, + {}, + quantize=True, ) pipeline.run() -def test_max_index_not_delegated_MI(): - data, dim = Min.test_data["rank_4_dim_3"] - pipeline = OpNotSupportedPipeline[Min.input_t]( - MinWithIndex(dim), data, "TOSA-0.80+MI", {} - ) +def test_min_dim_tosa_MI_not_delegated(): + data, dim = Min.test_data["rank_4_dim_3"]() + pipeline = OpNotSupportedPipeline[Min.input_t](MinWithIndex(dim), data, {}) pipeline.run() diff --git a/backends/arm/test/ops/test_any.py b/backends/arm/test/ops/test_any.py index b5de87061ea..6ddef1ad0b5 100644 --- a/backends/arm/test/ops/test_any.py +++ b/backends/arm/test/ops/test_any.py @@ -45,90 +45,94 @@ def forward(self, x: torch.Tensor): test_input: dict[input_t1] = { - "rank1": (torch.tensor([True, False, False], dtype=torch.bool), 0, True), - "rank1_squeeze": (torch.tensor([True, False, False], dtype=torch.bool), -1, False), - "rank2": ( + "rank1": lambda: (torch.tensor([True, False, False], dtype=torch.bool), 0, True), + "rank1_squeeze": lambda: ( + torch.tensor([True, False, False], dtype=torch.bool), + -1, + False, + ), + "rank2": lambda: ( torch.randint(0, 2, (2, 3), dtype=torch.bool), 0, True, ), - "rank2_squeeze": ( + "rank2_squeeze": lambda: ( torch.randint(0, 2, (2, 3), dtype=torch.bool), 0, False, ), - "rank2_dims": ( + "rank2_dims": lambda: ( torch.randint(0, 2, (2, 3), dtype=torch.bool), [0, 1], True, ), - "rank2_dims_squeeze": ( + "rank2_dims_squeeze": lambda: ( torch.randint(0, 2, (2, 3), dtype=torch.bool), [-2, 1], False, ), - "rank3_dims_squeeze": ( + "rank3_dims_squeeze": lambda: ( torch.randint(0, 2, (6, 8, 10), dtype=torch.bool), [1, 2], False, ), - "rank4": ( + "rank4": lambda: ( torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool), 1, True, ), - "rank4_squeeze": ( + "rank4_squeeze": lambda: ( torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool), 1, False, ), - "rank4_dims": ( + "rank4_dims": lambda: ( torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool), [0, 2], True, ), - "rank4_dims_squeeze": ( + "rank4_dims_squeeze": lambda: ( torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool), [1, -1], False, ), - "rank1_reduce_all": (torch.tensor([True, False, False], dtype=torch.bool),), - "rank2_reduce_all": (torch.randint(0, 2, (2, 3), dtype=torch.bool),), - "rank3_reduce_all": (torch.randint(0, 2, (6, 8, 10), dtype=torch.bool),), - "rank4_reduce_all": (torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),), + "rank1_reduce_all": lambda: (torch.tensor([True, False, False], dtype=torch.bool),), + "rank2_reduce_all": lambda: (torch.randint(0, 2, (2, 3), dtype=torch.bool),), + "rank3_reduce_all": lambda: (torch.randint(0, 2, (6, 8, 10), dtype=torch.bool),), + "rank4_reduce_all": lambda: (torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),), } test_data = { - "any_rank1": (AnyDim(), test_input["rank1"]), - "any_rank1_squeeze": (AnyDim(), test_input["rank1_squeeze"]), - "any_rank2": (AnyDim(), test_input["rank2"]), - "any_rank2_squeeze": (AnyDim(), test_input["rank2_squeeze"]), - "any_rank2_dims": (AnyDims(), test_input["rank2_dims"]), - "any_rank2_dims_squeeze": (AnyDims(), test_input["rank2_dims_squeeze"]), - "any_rank3_dims_squeeze": (AnyDims(), test_input["rank3_dims_squeeze"]), - "any_rank4": (AnyDim(), test_input["rank4"]), - "any_rank4_squeeze": (AnyDim(), 
test_input["rank4_squeeze"]), - "any_rank4_dims": (AnyDims(), test_input["rank4_dims"]), - "any_rank4_dims_squeeze": (AnyDims(), test_input["rank4_dims_squeeze"]), - "any_rank1_reduce_all": (AnyReduceAll(), test_input["rank1_reduce_all"]), - "any_rank2_reduce_all": (AnyReduceAll(), test_input["rank2_reduce_all"]), - "any_rank3_reduce_all": (AnyReduceAll(), test_input["rank3_reduce_all"]), - "any_rank4_reduce_all": (AnyReduceAll(), test_input["rank4_reduce_all"]), + "any_rank1": lambda: (AnyDim(), test_input["rank1"]), + "any_rank1_squeeze": lambda: (AnyDim(), test_input["rank1_squeeze"]), + "any_rank2": lambda: (AnyDim(), test_input["rank2"]), + "any_rank2_squeeze": lambda: (AnyDim(), test_input["rank2_squeeze"]), + "any_rank2_dims": lambda: (AnyDims(), test_input["rank2_dims"]), + "any_rank2_dims_squeeze": lambda: (AnyDims(), test_input["rank2_dims_squeeze"]), + "any_rank3_dims_squeeze": lambda: (AnyDims(), test_input["rank3_dims_squeeze"]), + "any_rank4": lambda: (AnyDim(), test_input["rank4"]), + "any_rank4_squeeze": lambda: (AnyDim(), test_input["rank4_squeeze"]), + "any_rank4_dims": lambda: (AnyDims(), test_input["rank4_dims"]), + "any_rank4_dims_squeeze": lambda: (AnyDims(), test_input["rank4_dims_squeeze"]), + "any_rank1_reduce_all": lambda: (AnyReduceAll(), test_input["rank1_reduce_all"]), + "any_rank2_reduce_all": lambda: (AnyReduceAll(), test_input["rank2_reduce_all"]), + "any_rank3_reduce_all": lambda: (AnyReduceAll(), test_input["rank3_reduce_all"]), + "any_rank4_reduce_all": lambda: (AnyReduceAll(), test_input["rank4_reduce_all"]), } @common.parametrize("test_data", test_data) def test_any_tosa_MI(test_data: input_t1): - op, test_input = test_data - pipeline = TosaPipelineMI[input_t1](op, test_input, op.aten_op, op.exir_op) + op, test_input = test_data() + pipeline = TosaPipelineMI[input_t1](op, test_input(), op.aten_op, op.exir_op) pipeline.run() @common.parametrize("test_data", test_data) def test_any_tosa_BI(test_data: input_t1): - op, test_input = test_data - pipeline = TosaPipelineBI[input_t1](op, test_input, op.aten_op, op.exir_op) + op, test_input = test_data() + pipeline = TosaPipelineBI[input_t1](op, test_input(), op.aten_op, op.exir_op) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -137,9 +141,13 @@ def test_any_tosa_BI(test_data: input_t1): @common.parametrize("test_data", test_data) def test_any_u55_BI(test_data: input_t1): # Tests that we don't delegate these ops since they are not supported on U55. 
- op, test_input = test_data + op, test_input = test_data() pipeline = OpNotSupportedPipeline[input_t1]( - op, test_input, "TOSA-0.80+BI+u55", {op.exir_op: 1} + op, + test_input(), + {op.exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -148,9 +156,13 @@ def test_any_u55_BI(test_data: input_t1): @pytest.mark.xfail(reason="MLETORCH-706: Support ScalarType::Bool in EthosUBackend.") @common.XfailIfNoCorstone320 def test_any_u85_BI(test_data: input_t1): - op, test_input = test_data + op, test_input = test_data() pipeline = EthosU85PipelineBI[input_t1]( - op, test_input, op.aten_op, op.exir_op, run_on_fvp=True + op, + test_input(), + op.aten_op, + op.exir_op, + run_on_fvp=True, ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") diff --git a/backends/arm/test/ops/test_arange.py b/backends/arm/test/ops/test_arange.py index 124f3ee597e..cb5f329a7f9 100644 --- a/backends/arm/test/ops/test_arange.py +++ b/backends/arm/test/ops/test_arange.py @@ -54,16 +54,22 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_arange_start_step_tosa_MI(test_data: test_data_t): input_data, init_data = test_data pipeline = TosaPipelineMI[input_t]( - ArangeAdd(*init_data), input_data(), ArangeAdd.aten_op, ArangeAdd.exir_op + ArangeAdd(*init_data), + input_data(), + ArangeAdd.aten_op, + ArangeAdd.exir_op, ) pipeline.run() @common.parametrize("test_data", ArangeAdd.test_data_dtypes) -def test_arange_start_step_dtypes_tosa_MI(test_data: test_data_t): +def test_arange_start_step_tosa_MI_dtypes(test_data: test_data_t): input_data, init_data = test_data pipeline = TosaPipelineMI[input_t]( - ArangeAdd(*init_data), input_data(), ArangeAdd.aten_op, ArangeAdd.exir_op + ArangeAdd(*init_data), + input_data(), + ArangeAdd.aten_op, + ArangeAdd.exir_op, ) pipeline.run() @@ -72,27 +78,34 @@ def test_arange_start_step_dtypes_tosa_MI(test_data: test_data_t): def test_arange_start_step_tosa_BI(test_data: test_data_t): input_data, init_data = test_data pipeline = TosaPipelineBI[input_t]( - ArangeAdd(*init_data), input_data(), ArangeAdd.aten_op, ArangeAdd.exir_op + ArangeAdd(*init_data), + input_data(), + ArangeAdd.aten_op, + ArangeAdd.exir_op, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", ArangeAdd.test_data) -def test_arange_start_step_tosa_u55(test_data: test_data_t): +def test_arange_start_step_u55_BI(test_data: test_data_t): input_data, init_data = test_data pipeline = EthosU55PipelineBI[input_t]( - ArangeAdd(*init_data), input_data(), ArangeAdd.aten_op + ArangeAdd(*init_data), + input_data(), + ArangeAdd.aten_op, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", ArangeAdd.test_data) -def test_arange_start_step_tosa_u85(test_data: test_data_t): +def test_arange_start_step_u85_BI(test_data: test_data_t): input_data, init_data = test_data pipeline = EthosU85PipelineBI[input_t]( - ArangeAdd(*init_data), input_data(), ArangeAdd.aten_op + ArangeAdd(*init_data), + input_data(), + ArangeAdd.aten_op, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -120,7 +133,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_linspace_tosa_MI(test_data): input_data, init_data = test_data pipeline = TosaPipelineMI[input_t]( - LinspaceAdd(*init_data), input_data(), LinspaceAdd.aten_op, LinspaceAdd.exir_op + LinspaceAdd(*init_data), + input_data(), + LinspaceAdd.aten_op, + LinspaceAdd.exir_op, ) pipeline.run() @@ -129,7 +145,10 @@ def test_linspace_tosa_MI(test_data): def 
test_linspace_tosa_BI(test_data: test_data_t): input_data, init_data = test_data pipeline = TosaPipelineBI[input_t]( - LinspaceAdd(*init_data), input_data(), LinspaceAdd.aten_op, LinspaceAdd.exir_op + LinspaceAdd(*init_data), + input_data(), + LinspaceAdd.aten_op, + LinspaceAdd.exir_op, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py index c48595aec7f..65c1830b9b2 100644 --- a/backends/arm/test/ops/test_avg_pool2d.py +++ b/backends/arm/test/ops/test_avg_pool2d.py @@ -1,6 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# # Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the @@ -9,8 +8,6 @@ from typing import Tuple -import pytest - import torch from executorch.backends.arm.test import common, conftest @@ -23,9 +20,9 @@ TosaPipelineMI, ) - aten_op = "torch.ops.aten.avg_pool2d.default" exir_op = "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default" + input_t = Tuple[torch.Tensor] @@ -46,19 +43,19 @@ def forward(self, x): test_modules = { - "zeros": (AvgPool2d(4, 2, 0), (torch.zeros(1, 16, 50, 32),)), - "ones": (AvgPool2d(4, 2, 0), (torch.ones(1, 16, 50, 32),)), - "rand": (AvgPool2d(4, 2, 0), (torch.rand(1, 16, 50, 32),)), - "randn": (AvgPool2d(4, 2, 0), (torch.randn(1, 16, 50, 32),)), - "kernel_3x3_stride_1_pad_1": ( + "zeros": lambda: (AvgPool2d(4, 2, 0), (torch.zeros(1, 16, 50, 32),)), + "ones": lambda: (AvgPool2d(4, 2, 0), (torch.ones(1, 16, 50, 32),)), + "rand": lambda: (AvgPool2d(4, 2, 0), (torch.rand(1, 16, 50, 32),)), + "randn": lambda: (AvgPool2d(4, 2, 0), (torch.randn(1, 16, 50, 32),)), + "kernel_3x3_stride_1_pad_1": lambda: ( AvgPool2d((3, 3), (1, 1), 1), (torch.rand(1, 16, 50, 32),), ), - "kernel_3x2_stride_1x2_pad_1x0": ( + "kernel_3x2_stride_1x2_pad_1x0": lambda: ( AvgPool2d((3, 2), (1, 2), (1, 0)), (torch.rand(1, 16, 50, 32),), ), - "kernel_4x6_stride_1x2_pad_2x3": ( + "kernel_4x6_stride_1x2_pad_2x3": lambda: ( AvgPool2d((4, 6), (1, 2), (2, 3)), (torch.rand(1, 16, 50, 32),), ), @@ -66,9 +63,8 @@ def forward(self, x): @common.parametrize("test_module", test_modules) -@pytest.mark.tosa_ref_model -def test_avgpool2d_tosa_MI(test_module): - model, input_tensor = test_module +def test_avg_pool2d_tosa_MI(test_module): + model, input_tensor = test_module() pipeline = TosaPipelineMI[input_t]( model, @@ -83,9 +79,8 @@ def test_avgpool2d_tosa_MI(test_module): @common.parametrize("test_module", test_modules) -@pytest.mark.tosa_ref_model -def test_avgpool2d_tosa_BI(test_module): - model, input_tensor = test_module +def test_avg_pool2d_tosa_BI(test_module): + model, input_tensor = test_module() pipeline = TosaPipelineBI[input_t]( model, @@ -101,41 +96,9 @@ def test_avgpool2d_tosa_BI(test_module): @common.parametrize("test_module", test_modules) -def test_avgpool2d_u55_BI(test_module): - model, input_tensor = test_module - - pipeline = EthosU55PipelineBI[input_t]( - model, - input_tensor, - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_module", test_modules) -def test_avgpool2d_u85_BI(test_module): - model, input_tensor = test_module - - pipeline = EthosU85PipelineBI[input_t]( - model, - input_tensor, - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_module", test_modules) -@common.SkipIfNoCorstone300 -def 
test_avgpool2d_u55_BI_on_fvp(test_module): - model, input_tensor = test_module +@common.XfailIfNoCorstone300 +def test_avg_pool2d_u55_BI(test_module): + model, input_tensor = test_module() pipeline = EthosU55PipelineBI[input_t]( model, @@ -150,9 +113,9 @@ def test_avgpool2d_u55_BI_on_fvp(test_module): @common.parametrize("test_module", test_modules) -@common.SkipIfNoCorstone320 -def test_avgpool2d_u85_BI_on_fvp(test_module): - model, input_tensor = test_module +@common.XfailIfNoCorstone320 +def test_avg_pool2d_u85_BI(test_module): + model, input_tensor = test_module() pipeline = EthosU85PipelineBI[input_t]( model, @@ -168,14 +131,20 @@ def test_avgpool2d_u85_BI_on_fvp(test_module): reject_modules = { - "kernel_1x1_stride_1_pad_0": (AvgPool2d(1, 1, 0), torch.rand(2, 5, 5, 5)), - "kernel_2x9_stride_1_pad_1": (AvgPool2d((2, 9), 1, 1), torch.rand(1, 16, 5, 32)), - "kernel_1x4_stride_0_pad_0": (AvgPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), - "kernel_1x257_stride_1_pad_0_large": ( + "kernel_1x1_stride_1_pad_0": lambda: (AvgPool2d(1, 1, 0), torch.rand(2, 5, 5, 5)), + "kernel_2x9_stride_1_pad_1": lambda: ( + AvgPool2d((2, 9), 1, 1), + torch.rand(1, 16, 5, 32), + ), + "kernel_1x4_stride_0_pad_0": lambda: ( + AvgPool2d(1, 4, 0), + torch.rand(1, 10, 10, 10), + ), + "kernel_1x257_stride_1_pad_0_large": lambda: ( AvgPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300), ), - "kernel_800x90_stride_1_pad_0_extreme": ( + "kernel_800x90_stride_1_pad_0_extreme": lambda: ( AvgPool2d((800, 90), 1, 0), torch.rand(1, 16, 850, 100), ), @@ -183,15 +152,15 @@ def test_avgpool2d_u85_BI_on_fvp(test_module): @common.parametrize("reject_module", reject_modules) -def test_reject_avgpool2d(reject_module): +def test_avg_pool2d_tosa_BI_not_delegated(reject_module): - model, test_data = reject_module + model, test_data = reject_module() pipeline = OpNotSupportedPipeline[input_t]( module=model, test_data=(test_data,), - tosa_version="TOSA-0.80+BI", non_delegated_ops={}, n_expected_delegates=0, + quantize=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index 980ab28df64..5134353c671 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -5,20 +5,25 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import unittest from typing import Tuple +import pytest + import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +test_data_suite = { # (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var,] ) - ( - "zeros_affineT_runStatsT_default_weight_bias_mean_var", + "zeros_affineT_runStatsT_default_weight_bias_mean_var": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -26,8 +31,7 @@ True, ], ), - ( - "zeros_affineF_runStatsT_default_weight_bias_mean_var", + "zeros_affineF_runStatsT_default_weight_bias_mean_var": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -35,8 +39,7 @@ True, ], ), - ( - "zeros_affineT_runStatsT_rand_weight_bias_mean_var", + "zeros_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -48,8 +51,7 @@ torch.rand(32), ], ), - ( - "zeros_affineF_runStatsT_rand_weight_bias_mean_var", + "zeros_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -61,8 +63,7 @@ torch.rand(32), ], ), - ( - "ones_affineT_runStatsT_default_weight_bias_mean_var", + "ones_affineT_runStatsT_default_weight_bias_mean_var": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -70,8 +71,7 @@ True, ], ), - ( - "ones_affineF_runStatsT_default_weight_bias_mean_var", + "ones_affineF_runStatsT_default_weight_bias_mean_var": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -79,8 +79,7 @@ True, ], ), - ( - "ones_affineT_runStatsT_rand_weight_bias_mean_var", + "ones_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -92,8 +91,7 @@ torch.rand(32), ], ), - ( - "ones_affineF_runStatsT_rand_weight_bias_mean_var", + "ones_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -105,8 +103,7 @@ torch.rand(32), ], ), - ( - "rand_affineT_runStatsT_default_weight_bias_mean_var", + "rand_affineT_runStatsT_default_weight_bias_mean_var": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -114,8 +111,7 @@ True, ], ), - ( - "rand_affineF_runStatsT_default_weight_bias_mean_var", + "rand_affineF_runStatsT_default_weight_bias_mean_var": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -123,8 +119,7 @@ True, ], ), - ( - "rand_affineT_runStatsT_rand_weight_bias_mean_var", + "rand_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -136,8 +131,7 @@ torch.rand(32), ], ), - ( - "rand_affineF_runStatsT_rand_weight_bias_mean_var", + "rand_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -149,8 +143,7 @@ torch.rand(32), ], ), - ( - "randn_affineT_runStatsT_default_weight_bias_mean_var", + "randn_affineT_runStatsT_default_weight_bias_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -158,8 +151,7 @@ True, ], ), - ( - "randn_affineF_runStatsT_default_weight_bias_mean_var", + "randn_affineF_runStatsT_default_weight_bias_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -167,8 +159,7 @@ True, ], ), - ( - "randn_affineT_runStatsT_rand_weight_bias_mean_var", + "randn_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -180,8 +171,7 @@ torch.rand(32), ], ), - ( - 
"randn_affineF_runStatsT_rand_weight_bias_mean_var", + "randn_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -194,100 +184,81 @@ ], ), # Test some different sizes - ( - "size_3_4_5_6_affineT_runStatsT_rand_weight_bias_mean_var", + "size_3_4_5_6_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(3, 4, 5, 6), [4, True, True, torch.rand(4), torch.rand(4), torch.rand(4), torch.rand(4)], ), - ( - "size_3_4_5_6_affineF_runStatsT_rand_weight_bias_mean_var", + "size_3_4_5_6_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(3, 4, 5, 6), [4, True, True, torch.rand(4), torch.rand(4), torch.rand(4), torch.rand(4)], ), - ( - "size_1_3_254_254_affineT_runStatsT_rand_weight_bias_mean_var", + "size_1_3_254_254_affineT_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 3, 254, 254), [3, True, True, torch.rand(3), torch.rand(3), torch.rand(3), torch.rand(3)], ), - ( - "size_1_3_254_254_affineF_runStatsT_rand_weight_bias_mean_var", + "size_1_3_254_254_affineF_runStatsT_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 3, 254, 254), [3, True, True, torch.rand(3), torch.rand(3), torch.rand(3), torch.rand(3)], ), # Test combination of weight and bias - ( - "check_weight_bias_affineT_runStatsT_none_none", + "check_weight_bias_affineT_runStatsT_none_none": lambda: ( torch.rand(1, 32, 112, 112), [32, True, True, None, None], ), - ( - "check_weight_bias_affineF_runStatsT_none_none", + "check_weight_bias_affineF_runStatsT_none_none": lambda: ( torch.rand(1, 32, 112, 112), [32, False, True, None, None], ), - ( - "check_weight_bias_affineT_runStatsT_weight_none", + "check_weight_bias_affineT_runStatsT_weight_none": lambda: ( torch.rand(1, 32, 112, 112), [32, True, True, torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsT_weight_none", + "check_weight_bias_affineF_runStatsT_weight_none": lambda: ( torch.rand(1, 32, 112, 112), [32, False, True, torch.rand(32)], ), - ( - "check_weight_bias_affineT_runStatsT_none_bias", + "check_weight_bias_affineT_runStatsT_none_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, True, True, None, torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsT_none_bias", + "check_weight_bias_affineF_runStatsT_none_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, False, True, None, torch.rand(32)], ), - ( - "check_weight_bias_affineT_runStatsT_weight_bias", + "check_weight_bias_affineT_runStatsT_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, True, True, torch.rand(32), torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsT_weight_bias", + "check_weight_bias_affineF_runStatsT_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, False, True, torch.rand(32), torch.rand(32)], ), # Test combination of running_mean and running_var - ( - "check_mean_var_affineT_runStatsT_none_none", + "check_mean_var_affineT_runStatsT_none_none": lambda: ( torch.randn(1, 32, 112, 112), [32, True, True, torch.rand(32), torch.rand(32), None, None], ), - ( - "check_mean_var_affineF_runStatsT_none_none", + "check_mean_var_affineF_runStatsT_none_none": lambda: ( torch.randn(1, 32, 112, 112), [32, False, True, torch.rand(32), torch.rand(32), None, None], ), - ( - "check_mean_var_affineT_runStatsT_mean_none", + "check_mean_var_affineT_runStatsT_mean_none": lambda: ( torch.randn(1, 32, 112, 112), [32, True, True, torch.rand(32), torch.rand(32), torch.rand(32), None], ), - ( - "check_mean_var_affineF_runStatsT_mean_none", + "check_mean_var_affineF_runStatsT_mean_none": lambda: ( torch.randn(1, 
32, 112, 112), [32, False, True, torch.rand(32), torch.rand(32), torch.rand(32), None], ), - ( - "check_mean_var_affineT_runStatsT_none_var", + "check_mean_var_affineT_runStatsT_none_var": lambda: ( torch.randn(1, 32, 112, 112), [32, True, True, torch.rand(32), torch.rand(32), None, torch.rand(32)], ), - ( - "check_mean_var_affineF_runStatsT_none_var", + "check_mean_var_affineF_runStatsT_none_var": lambda: ( torch.randn(1, 32, 112, 112), [32, False, True, torch.rand(32), torch.rand(32), None, torch.rand(32)], ), - ( - "check_mean_var_affineT_runStatsT_mean_var", + "check_mean_var_affineT_runStatsT_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -299,8 +270,7 @@ torch.rand(32), ], ), - ( - "check_mean_var_affineF_runStatsT_mean_var", + "check_mean_var_affineF_runStatsT_mean_var": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -312,12 +282,11 @@ torch.rand(32), ], ), -] +} -test_no_stats_data_suite = [ +test_no_stats_data_suite = { # (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var, ] ) - ( - "zeros_affineT_runStatsF_default_weight_bias", + "zeros_affineT_runStatsF_default_weight_bias": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -325,8 +294,7 @@ False, ], ), - ( - "zeros_affineF_runStatsF_default_weight_bias", + "zeros_affineF_runStatsF_default_weight_bias": lambda: ( torch.zeros(1, 32, 112, 112), [ 32, @@ -334,18 +302,15 @@ False, ], ), - ( - "zeros_affineT_runStatsF_rand_weight_bias", + "zeros_affineT_runStatsF_rand_weight_bias": lambda: ( torch.zeros(1, 32, 112, 112), [32, True, False, torch.rand(32), torch.rand(32)], ), - ( - "zeros_affineF_runStatsF_rand_weight_bias", + "zeros_affineF_runStatsF_rand_weight_bias": lambda: ( torch.zeros(1, 32, 112, 112), [32, False, False, torch.rand(32), torch.rand(32)], ), - ( - "ones_affineT_runStatsF_default_weight_bias", + "ones_affineT_runStatsF_default_weight_bias": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -353,8 +318,7 @@ False, ], ), - ( - "ones_affineF_runStatsF_default_weight_bias", + "ones_affineF_runStatsF_default_weight_bias": lambda: ( torch.ones(1, 32, 112, 112), [ 32, @@ -362,18 +326,15 @@ False, ], ), - ( - "ones_affineT_runStatsF_rand_weight_bias", + "ones_affineT_runStatsF_rand_weight_bias": lambda: ( torch.ones(1, 32, 112, 112), [32, True, False, torch.rand(32), torch.rand(32)], ), - ( - "ones_affineF_runStatsF", + "ones_affineF_runStatsF": lambda: ( torch.ones(1, 32, 112, 112), [32, False, False, torch.rand(32), torch.rand(32)], ), - ( - "rand_affineT_runStatsF_default_weight_bias", + "rand_affineT_runStatsF_default_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -381,8 +342,7 @@ False, ], ), - ( - "rand_affineF_runStatsF_default_weight_bias", + "rand_affineF_runStatsF_default_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [ 32, @@ -390,18 +350,15 @@ False, ], ), - ( - "rand_affineT_runStatsF_rand_weight_bias", + "rand_affineT_runStatsF_rand_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, True, False, torch.rand(32), torch.rand(32)], ), - ( - "rand_affineF_runStatsF_rand_weight_bias", + "rand_affineF_runStatsF_rand_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, False, False, torch.rand(32), torch.rand(32)], ), - ( - "randn_affineT_runStatsF_default_weight_bias", + "randn_affineT_runStatsF_default_weight_bias": lambda: ( torch.randn(1, 32, 112, 112), [ 32, @@ -409,8 +366,7 @@ False, ], ), - ( - "randn_affineF_runStatsF_default_weight_bias", + "randn_affineF_runStatsF_default_weight_bias": lambda: ( 
torch.randn(1, 32, 112, 112), [ 32, @@ -418,304 +374,148 @@ False, ], ), - ( - "randn_affineT_runStatsF_rand_weight_bias", + "randn_affineT_runStatsF_rand_weight_bias": lambda: ( torch.randn(1, 32, 112, 112), [32, True, False, torch.rand(32), torch.rand(32)], ), - ( - "randn_affineF_runStatsF_rand_weight_bias", + "randn_affineF_runStatsF_rand_weight_bias": lambda: ( torch.randn(1, 32, 112, 112), [32, False, False, torch.rand(32), torch.rand(32)], ), # Test some different sizes - ( - "size_3_4_5_6_affineT_runStatsF_rand_weight_bias_mean_var", + "size_3_4_5_6_affineT_runStatsF_rand_weight_bias_mean_var": lambda: ( torch.rand(3, 4, 5, 6), [4, True, False, torch.rand(4), torch.rand(4)], ), - ( - "size_3_4_5_6_affineF_runStatsF_rand_weight_bias_mean_var", + "size_3_4_5_6_affineF_runStatsF_rand_weight_bias_mean_var": lambda: ( torch.rand(3, 4, 5, 6), [4, True, False, torch.rand(4), torch.rand(4)], ), - ( - "size_1_3_254_254_affineT_runStatsF_rand_weight_bias_mean_var", + "size_1_3_254_254_affineT_runStatsF_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 3, 254, 254), [3, True, False, torch.rand(3), torch.rand(3)], ), - ( - "size_1_3_254_254_affineF_runStatsF_rand_weight_bias_mean_var", + "size_1_3_254_254_affineF_runStatsF_rand_weight_bias_mean_var": lambda: ( torch.rand(1, 3, 254, 254), [3, True, False, torch.rand(3), torch.rand(3)], ), # Test combination of weight and bias - ( - "check_weight_bias_affineT_runStatsF_none_none", + "check_weight_bias_affineT_runStatsF_none_none": lambda: ( torch.rand(1, 32, 112, 112), [32, True, False, None, None], ), - ( - "check_weight_bias_affineF_runStatsF_none_none", + "check_weight_bias_affineF_runStatsF_none_none": lambda: ( torch.rand(1, 32, 112, 112), [32, False, False, None, None], ), - ( - "check_weight_bias_affineT_runStatsF_weight_none", + "check_weight_bias_affineT_runStatsF_weight_none": lambda: ( torch.rand(1, 32, 112, 112), [32, True, False, torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsF_weight_none", + "check_weight_bias_affineF_runStatsF_weight_none": lambda: ( torch.rand(1, 32, 112, 112), [32, False, False, torch.rand(32)], ), - ( - "check_weight_bias_affineT_runStatsF_none_bias", + "check_weight_bias_affineT_runStatsF_none_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, True, False, None, torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsF_none_bias", + "check_weight_bias_affineF_runStatsF_none_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, False, False, None, torch.rand(32)], ), - ( - "check_weight_bias_affineT_runStatsF_weight_bias", + "check_weight_bias_affineT_runStatsF_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, True, False, torch.rand(32), torch.rand(32)], ), - ( - "check_weight_bias_affineF_runStatsF_weight_bias", + "check_weight_bias_affineF_runStatsF_weight_bias": lambda: ( torch.rand(1, 32, 112, 112), [32, False, False, torch.rand(32), torch.rand(32)], ), -] - - -class TestBatchNorm2d(unittest.TestCase): - """Tests BatchNorm2d.""" +} - class BatchNorm2d(torch.nn.Module): - def __init__( - self, - num_features: int = 32, - affine: bool = False, - track_running_stats: bool = True, - weights: torch.tensor = None, - bias: torch.tensor = None, - running_mean: torch.tensor = None, - running_var: torch.tensor = None, - ): - super().__init__() - self.batch_norm_2d = torch.nn.BatchNorm2d( - num_features, affine=affine, track_running_stats=track_running_stats - ) - if weights is not None: - self.batch_norm_2d.weight = torch.nn.Parameter(weights) - if bias is not None: - 
self.batch_norm_2d.bias = torch.nn.Parameter(bias) - if running_mean is not None: - self.batch_norm_2d.running_mean = running_mean - if running_var is not None: - self.batch_norm_2d.running_var = running_var - def forward(self, x): - return self.batch_norm_2d(x) - - def _test_batchnorm2d_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] +class BatchNorm2d(torch.nn.Module): + def __init__( + self, + num_features: int = 32, + affine: bool = False, + track_running_stats: bool = True, + weights: torch.tensor = None, + bias: torch.tensor = None, + running_mean: torch.tensor = None, + running_var: torch.tensor = None, ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1 - } - ) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default" - ] - ) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) + super().__init__() + self.batch_norm_2d = torch.nn.BatchNorm2d( + num_features, affine=affine, track_running_stats=track_running_stats ) + if weights is not None: + self.batch_norm_2d.weight = torch.nn.Parameter(weights) + if bias is not None: + self.batch_norm_2d.bias = torch.nn.Parameter(bias) + if running_mean is not None: + self.batch_norm_2d.running_mean = running_mean + if running_var is not None: + self.batch_norm_2d.running_var = running_var - def _test_batchnorm2d_no_stats_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten._native_batch_norm_legit.no_stats": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_stats": 1 - } - ) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_stats" - ] - ) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) + def forward(self, x): + return self.batch_norm_2d(x) - def _test_batchnorm2d_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count( - {"torch.ops.aten._native_batch_norm_legit_no_training.default": 1} - ) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1 - } - ) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default" - ] - ) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - def _test_batchnorm2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) - .quantize() - 
.export()
-            .check_count(
-                {"torch.ops.aten._native_batch_norm_legit_no_training.default": 1}
-            )
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .check_count(
-                {
-                    "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1
-                }
-            )
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .check_not(
-                [
-                    "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default"
-                ]
-            )
-            .to_executorch()
-        )
+@common.parametrize("test_data", test_data_suite)
+def test_native_batch_norm_legit_tosa_MI_no_training(test_data: Tuple):
+    test_data, model_params = test_data()
+    pipeline = TosaPipelineMI[input_t1](
+        BatchNorm2d(*model_params),
+        (test_data,),
+        aten_op=[],
+        exir_op="executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default",
+    )
+    pipeline.run()
-    @parameterized.expand(test_data_suite)
-    def test_native_batch_norm_legit_no_training_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        model_params: (
-            int
-            | Tuple[
-                int, bool, bool, torch.tensor, torch.tensor, torch.tensor, torch.tensor
-            ]
-        ),
-    ):
-        self._test_batchnorm2d_tosa_MI_pipeline(
-            self.BatchNorm2d(*model_params), (test_data,)
-        )
-    # Expected to fail since not inplemented
-    @parameterized.expand(test_no_stats_data_suite)
-    @unittest.expectedFailure
-    def test_native_batch_norm_legit_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        model_params: (
-            int
-            | Tuple[
-                int, bool, bool, torch.tensor, torch.tensor, torch.tensor, torch.tensor
-            ]
-        ),
-    ):
-        self._test_batchnorm2d_no_stats_tosa_MI_pipeline(
-            self.BatchNorm2d(*model_params), (test_data,)
-        )
+@common.parametrize("test_data", test_no_stats_data_suite)
+# Expected to fail since not implemented
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_native_batch_norm_legit_tosa_MI(test_data: Tuple):
+    test_data, model_params = test_data()
+    pipeline = TosaPipelineMI[input_t1](
+        BatchNorm2d(*model_params),
+        (test_data,),
+        aten_op=[],
+        exir_op="executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default",
+    )
+    pipeline.pop_stage("check_count.exir")
+    pipeline.run()
+
-    # Expected to fail since TOSAQuantizer cannot quantize a BatchNorm layer
-    # TODO(MLETORCH-100)
-    @parameterized.expand(test_data_suite)
-    @unittest.skip(
-        reason="Expected to fail since TOSAQuantizer (for BI) cannot quantize a BatchNorm layer"
+# Expected to fail since TOSAQuantizer cannot quantize a BatchNorm layer
+# TODO(MLETORCH-100)
+@common.parametrize("test_data", test_data_suite)
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_native_batch_norm_legit_tosa_BI_no_training(test_data: Tuple): + test_data, model_params = test_data() + pipeline = TosaPipelineBI[input_t1]( + BatchNorm2d(*model_params), + (test_data,), + aten_op="torch.ops.aten._native_batch_norm_legit_no_training.default", + exir_op="executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", ) - def test_native_batch_norm_legit_no_training_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: ( - int - | Tuple[ - int, bool, bool, torch.tensor, torch.tensor, torch.tensor, torch.tensor - ] - ), - ): - self._test_batchnorm2d_tosa_BI_pipeline( - self.BatchNorm2d(*model_params), (test_data,) - ) + pipeline.run() + - # Expected to fail since EthosUQuantizer (TOSAQuantizer (BI)) cannot quantize a BatchNorm layer - # TODO(MLETORCH-100) - @parameterized.expand(test_data_suite) - @unittest.skip( - reason="Expected to fail since EthosUQuantizer cannot quantize a BatchNorm layer" +# Expected to fail since EthosUQuantizer (TOSAQuantizer (BI)) cannot quantize a BatchNorm layer +# TODO(MLETORCH-100) +@common.parametrize("test_data", test_data_suite) +@pytest.mark.skip # Not implemented, skip until it is. +def test_native_batch_norm_legit_u55_BI_no_training(test_data: Tuple): + test_data, model_params = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + BatchNorm2d(*model_params), + test_data, + aten_ops="torch.ops.aten._native_batch_norm_legit_no_training.default", + exir_ops="executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", + run_on_fvp=True, ) - @unittest.expectedFailure - def test_native_batch_norm_legit_no_training_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: ( - int - | Tuple[ - int, bool, bool, torch.tensor, torch.tensor, torch.tensor, torch.tensor - ] - ), - ): - self._test_batchnorm2d_u55_BI_pipeline( - self.BatchNorm2d(*model_params), (test_data,) - ) + pipeline.run() diff --git a/backends/arm/test/ops/test_bitwise.py b/backends/arm/test/ops/test_bitwise.py index 412701b17da..8be8ba35b4e 100644 --- a/backends/arm/test/ops/test_bitwise.py +++ b/backends/arm/test/ops/test_bitwise.py @@ -22,19 +22,19 @@ class BitwiseBinary(torch.nn.Module): test_data: dict[input_t2] = { - "zeros": ( + "zeros": lambda: ( torch.zeros(1, 10, 10, 10, dtype=torch.int32), torch.zeros(1, 10, 10, 10, dtype=torch.int32), ), - "ones": ( + "ones": lambda: ( torch.ones(10, 10, 10, dtype=torch.int8), torch.ones(10, 10, 10, dtype=torch.int8), ), - "rand_rank2": ( + "rand_rank2": lambda: ( torch.randint(-128, 127, (10, 10), dtype=torch.int8), torch.randint(-128, 127, (10, 10), dtype=torch.int8), ), - "rand_rank4": ( + "rand_rank4": lambda: ( torch.randint(-128, -127, (1, 10, 10, 10), dtype=torch.int8), torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), ), @@ -67,13 +67,17 @@ def forward(self, tensor1: torch.Tensor, tensor2: torch.Tensor): @common.parametrize("test_data", And().test_data) def test_bitwise_and_tensor_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](And(), test_data, And().aten_op, And().exir_op) + pipeline = TosaPipelineMI[input_t2]( + And(), test_data(), And().aten_op, And().exir_op + ) pipeline.run() @common.parametrize("test_data", And().test_data) def test_bitwise_and_tensor_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](And(), test_data, And().aten_op, And().exir_op) + pipeline = TosaPipelineBI[input_t2]( + And(), test_data(), And().aten_op, And().exir_op + ) 
pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -83,7 +87,11 @@ def test_bitwise_and_tensor_tosa_BI(test_data: input_t2): def test_bitwise_and_tensor_u55_BI(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. pipeline = OpNotSupportedPipeline[input_t2]( - And(), test_data, "TOSA-0.80+BI+u55", {And().exir_op: 1} + And(), + test_data(), + {And().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -93,7 +101,7 @@ def test_bitwise_and_tensor_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_bitwise_and_tensor_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - And(), test_data, And().aten_op, And().exir_op, run_on_fvp=True + And(), test_data(), And().aten_op, And().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") @@ -102,13 +110,17 @@ def test_bitwise_and_tensor_u85_BI(test_data: input_t2): @common.parametrize("test_data", Xor().test_data) def test_bitwise_xor_tensor_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Xor(), test_data, Xor().aten_op, Xor().exir_op) + pipeline = TosaPipelineMI[input_t2]( + Xor(), test_data(), Xor().aten_op, Xor().exir_op + ) pipeline.run() @common.parametrize("test_data", Xor().test_data) def test_bitwise_xor_tensor_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Xor(), test_data, Xor().aten_op, Xor().exir_op) + pipeline = TosaPipelineBI[input_t2]( + Xor(), test_data(), Xor().aten_op, Xor().exir_op + ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -118,7 +130,11 @@ def test_bitwise_xor_tensor_tosa_BI(test_data: input_t2): def test_bitwise_xor_tensor_u55_BI(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. pipeline = OpNotSupportedPipeline[input_t2]( - Xor(), test_data, "TOSA-0.80+BI+u55", {Xor().exir_op: 1} + Xor(), + test_data(), + {Xor().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -128,7 +144,7 @@ def test_bitwise_xor_tensor_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_bitwise_xor_tensor_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Xor(), test_data, Xor().aten_op, Xor().exir_op, run_on_fvp=True + Xor(), test_data(), Xor().aten_op, Xor().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") @@ -137,13 +153,13 @@ def test_bitwise_xor_tensor_u85_BI(test_data: input_t2): @common.parametrize("test_data", Or().test_data) def test_bitwise_or_tensor_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Or(), test_data, Or().aten_op, Or().exir_op) + pipeline = TosaPipelineMI[input_t2](Or(), test_data(), Or().aten_op, Or().exir_op) pipeline.run() @common.parametrize("test_data", Or().test_data) def test_bitwise_or_tensor_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Or(), test_data, Or().aten_op, Or().exir_op) + pipeline = TosaPipelineBI[input_t2](Or(), test_data(), Or().aten_op, Or().exir_op) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -153,7 +169,11 @@ def test_bitwise_or_tensor_tosa_BI(test_data: input_t2): def test_bitwise_or_tensor_u55_BI(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. 
pipeline = OpNotSupportedPipeline[input_t2]( - Or(), test_data, "TOSA-0.80+BI+u55", {Or().exir_op: 1} + Or(), + test_data(), + {Or().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -163,7 +183,11 @@ def test_bitwise_or_tensor_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_bitwise_or_tensor_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Or(), test_data, Or().aten_op, Or().exir_op, run_on_fvp=True + Or(), + test_data(), + Or().aten_op, + Or().exir_op, + run_on_fvp=True, ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 375e77cb9b0..bd2c9338275 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -1,165 +1,162 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest -from typing import Callable, Tuple +from typing import Tuple import pytest import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestBMM(unittest.TestCase): - """Tests Batch MatMul""" - - class BMM(torch.nn.Module): - test_data_generators = [ - lambda: (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), - lambda: (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), - lambda: (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), - lambda: (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), - lambda: (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), - ] - - def forward(self, x, y): - return torch.bmm(x, y) - - class BMMSingleInput(torch.nn.Module): - test_data_generators = [ - lambda: (torch.rand(20, 3, 3),), - lambda: (torch.rand(2, 128, 128),), - lambda: (10000 * torch.randn(4, 25, 25),), - lambda: (5 + 5 * torch.randn(3, 64, 64),), - ] - - def forward(self, x): - return torch.bmm(x, x) - - def _test_bmm_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 1}) - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_bmm_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] 
- ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 1}) - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_bmm_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor, ...], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.bmm.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) - - @parameterized.expand(BMM.test_data_generators) - def test_bmm_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data) - - @parameterized.expand(BMMSingleInput.test_data_generators) - @pytest.mark.flaky # TODO: Investigate flakyness (MLETORCH-534) - def test_bmm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data) - - @parameterized.expand(BMM.test_data_generators) - def test_bmm_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) - - @parameterized.expand(BMMSingleInput.test_data_generators) - @pytest.mark.flaky # TODO: Investigate flakyness (MLETORCH-534) - def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) - - @parameterized.expand(BMM.test_data_generators) - @pytest.mark.corstone_fvp - def test_bmm_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_ethosu_BI_pipeline( - self.BMM(), common.get_u55_compile_spec(), test_data - ) - - @parameterized.expand(BMM.test_data_generators) - @pytest.mark.corstone_fvp - def test_bmm_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_ethosu_BI_pipeline( - self.BMM(), common.get_u85_compile_spec(), test_data - ) - - # Expected to fail on FVP as TOSA.MATMUL is not supported on U55 - @parameterized.expand(BMMSingleInput.test_data_generators) - @pytest.mark.corstone_fvp - def test_bmm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_ethosu_BI_pipeline( - self.BMMSingleInput(), common.get_u55_compile_spec(), test_data - ) - - @parameterized.expand(BMMSingleInput.test_data_generators) - @pytest.mark.corstone_fvp - def test_bmm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_bmm_ethosu_BI_pipeline( - self.BMMSingleInput(), common.get_u85_compile_spec(), test_data - ) + +from executorch.backends.arm.test import common + +from 
executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+aten_op_bmm = "torch.ops.aten.bmm.default"
+exir_op_bmm = "executorch_exir_dialects_edge__ops_aten_bmm_default"
+
+aten_op_mm = "torch.ops.aten.matmul.default"
+exir_op_mm = "executorch_exir_dialects_edge__ops_aten_matmul_default"
+
+input_t1 = Tuple[torch.Tensor, torch.Tensor]  # Input x
+
+
+class BMM(torch.nn.Module):
+    test_data_generators = {
+        "rand_same": lambda: (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
+        "rand_diff": lambda: (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
+        "rand_ones": lambda: (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
+        "rand_big": lambda: (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
+        "rand_neg": lambda: (
+            -10 * torch.randn(2, 32, 64),
+            5 + 5 * torch.randn(2, 64, 32),
+        ),
+    }
+
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+
+class MatMul(torch.nn.Module):
+    test_data_generators = {
+        "rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
+        "rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
+    }
+
+    def forward(self, x, y):
+        return torch.matmul(x, y)
+
+
+class BMMSingleInput(torch.nn.Module):
+    test_data_generators = {
+        "rand_3d_1": lambda: (torch.rand(20, 3, 3),),
+        "rand_3d_2": lambda: (torch.rand(2, 128, 128),),
+        "rand_big_1": lambda: (10000 * torch.randn(4, 25, 25),),
+        "rand_big_2": lambda: (5 + 5 * torch.randn(3, 64, 64),),
+    }
+
+    def forward(self, x):
+        return torch.bmm(x, x)
+
+
+@common.parametrize("test_data", BMM.test_data_generators)
+def test_bmm_tosa_MI(test_data: input_t1):
+    pipeline = TosaPipelineMI[input_t1](BMM(), test_data(), aten_op_bmm, exir_op_bmm)
+    pipeline.run()
+
+
+@pytest.mark.flaky(reruns=5)  # TODO: Investigate flakiness (MLETORCH-534)
+@common.parametrize("test_data", BMMSingleInput.test_data_generators)
+def test_bmm_tosa_MI_single_input(test_data: input_t1):
+    pipeline = TosaPipelineMI[input_t1](
+        BMMSingleInput(), test_data(), aten_op_bmm, exir_op_bmm
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", MatMul.test_data_generators)
+def test_mm_tosa_MI(test_data: input_t1):
+    pipeline = TosaPipelineMI[input_t1](MatMul(), test_data(), aten_op_mm, exir_op_mm)
+    pipeline.run()
+
+
+@common.parametrize("test_data", MatMul.test_data_generators)
+def test_mm_tosa_BI(test_data: input_t1):
+    pipeline = TosaPipelineBI[input_t1](MatMul(), test_data(), aten_op_mm, exir_op_mm)
+    pipeline.run()
+
+
+@pytest.mark.flaky(reruns=5)  # TODO: Investigate flakiness (MLETORCH-534)
+@common.parametrize("test_data", BMM.test_data_generators)
+def test_bmm_tosa_BI(test_data: input_t1):
+    pipeline = TosaPipelineBI[input_t1](BMM(), test_data(), aten_op_bmm, exir_op_bmm)
+    pipeline.run()
+
+
+@pytest.mark.flaky(reruns=5)  # TODO: Investigate flakiness (MLETORCH-534)
+@common.parametrize("test_data", BMMSingleInput.test_data_generators)
+def test_bmm_tosa_BI_single_input(test_data: input_t1):
+    pipeline = TosaPipelineBI[input_t1](
+        BMMSingleInput(), test_data(), aten_op_bmm, exir_op_bmm
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.run()
+
+
+@common.parametrize("test_data", BMM.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_bmm_u55_BI(test_data: input_t1):
+    pipeline = EthosU55PipelineBI[input_t1](
+        BMM(),
+        test_data(),
+        aten_op_bmm,
+        exir_op_bmm,
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", BMM.test_data_generators)
+@common.XfailIfNoCorstone320
+def 
test_bmm_u85_BI(test_data: input_t1): + pipeline = EthosU85PipelineBI[input_t1]( + BMM(), + test_data(), + aten_op_bmm, + exir_op_bmm, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", BMMSingleInput.test_data_generators) +@common.XfailIfNoCorstone300 +def test_bmm_u55_BI_single_input(test_data: input_t1): + pipeline = EthosU55PipelineBI[input_t1]( + BMMSingleInput(), + test_data(), + aten_op_bmm, + exir_op_bmm, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", BMMSingleInput.test_data_generators) +@common.XfailIfNoCorstone320 +def test_bmm_u85_BI_single_input(test_data: input_t1): + pipeline = EthosU85PipelineBI[input_t1]( + BMMSingleInput(), + test_data(), + aten_op_bmm, + exir_op_bmm, + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 63423b9e993..d5ebd6fe569 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -1,172 +1,138 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +input_t1 = Tuple[torch.Tensor] # Input x -class TestCat(unittest.TestCase): +aten_op = "torch.ops.aten.cat.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default" - class Cat(torch.nn.Module): - test_parameters = [ - ((torch.ones(1), torch.ones(1)), 0), - ((torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), 1), + +class Cat(torch.nn.Module): + test_parameters = { + "cat_ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0), + "cat_ones_and_rand_three_tensors": lambda: ( + (torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), + 1, + ), + "cat_ones_and_rand_four_tensors": lambda: ( ( - ( - torch.ones(1, 2, 5), - torch.randn(1, 2, 4), - torch.randn(1, 2, 2), - torch.randn(1, 2, 1), - ), - -1, + torch.ones(1, 2, 5), + torch.randn(1, 2, 4), + torch.randn(1, 2, 2), + torch.randn(1, 2, 1), ), - ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 1)), 3), - ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), 0), - ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3), + -1, + ), + "cat_rand_two_tensors": lambda: ( + (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 1)), + 3, + ), + "cat_rand_two_tensors_dim_0": lambda: ( + (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), + 0, + ), + "cat_rand_two_tensors_dim_3": lambda: ( + (torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), + 3, + ), + "cat_rand_large": lambda: ( ( - ( - 10000 * torch.randn(2, 3, 1, 4), - torch.randn(2, 7, 1, 4), - torch.randn(2, 1, 1, 4), - ), - -3, + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 7, 1, 4), + torch.randn(2, 1, 1, 4), ), - ] - - def __init__(self): - super().__init__() - - def forward(self, t: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor: - return 
torch.cat(t, dim=dim) - - def _test_cat_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.cat.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_cat_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.cat.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_cat_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[tuple[torch.Tensor, ...], int], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.cat.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(inputs=test_data) - - @parameterized.expand(Cat.test_parameters) - def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): - test_data = (operands, dim) - self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) - - def test_cat_4d_tosa_MI(self): - square = torch.ones((2, 2, 2, 2)) - for dim in range(-3, 3): - test_data = ((square, square.clone()), dim) - self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) - - @parameterized.expand(Cat.test_parameters) - def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): - test_data = (operands, dim) - self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) - - @parameterized.expand(Cat.test_parameters[:-3]) - @pytest.mark.corstone_fvp - def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): - test_data = (operands, dim) - self._test_cat_ethosu_BI_pipeline( - self.Cat(), common.get_u55_compile_spec(), test_data - ) - - # MLETORCH-630 Cat does not work on FVP with batch>1 - @parameterized.expand(Cat.test_parameters[-3:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_cat_u55_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int): - test_data = (operands, dim) - self._test_cat_ethosu_BI_pipeline( - self.Cat(), common.get_u55_compile_spec(), test_data - ) - - @parameterized.expand(Cat.test_parameters[:-3]) - @pytest.mark.corstone_fvp - def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): - test_data = (operands, dim) - self._test_cat_ethosu_BI_pipeline( - self.Cat(), common.get_u85_compile_spec(), test_data - ) - - # MLETORCH-630 Cat does not work on FVP with batch>1 - 
@parameterized.expand(Cat.test_parameters[-3:])
-    @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
-    def test_cat_u85_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int):
-        test_data = (operands, dim)
-        self._test_cat_ethosu_BI_pipeline(
-            self.Cat(), common.get_u85_compile_spec(), test_data
+            -3,
+        ),
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, t: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor:
+        return torch.cat(t, dim=dim)
+
+
+@common.parametrize("test_data", Cat.test_parameters)
+def test_cat_tosa_MI(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](
+        Cat(),
+        test_data(),
+        aten_op,
+        exir_op,
+    )
+    pipeline.run()
+
+
+def test_cat_tosa_MI_4d():
+    square = torch.ones((2, 2, 2, 2))
+    for dim in range(-3, 3):
+        test_data = ((square, square.clone()), dim)
+        pipeline = TosaPipelineMI[input_t1](
+            Cat(),
+            test_data,
+            aten_op,
+            exir_op,
         )
+        pipeline.run()
+
+
+@common.parametrize("test_data", Cat.test_parameters)
+def test_cat_tosa_BI(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](
+        Cat(),
+        test_data(),
+        aten_op,
+        exir_op,
+    )
+    pipeline.run()
+
+
+x_fails = {
+    "cat_rand_two_tensors_dim_0": "MLETORCH-630: AssertionError: Output 0 does not match reference output.",
+    "cat_rand_two_tensors_dim_3": "MLETORCH-630: AssertionError: Output 0 does not match reference output.",
+    "cat_rand_large": "MLETORCH-630: AssertionError: Output 0 does not match reference output.",
+}
+
+
+@common.parametrize("test_data", Cat.test_parameters, x_fails)
+@common.XfailIfNoCorstone300
+def test_cat_u55_BI(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        Cat(),
+        test_data(),
+        aten_op,
+        exir_op,
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Cat.test_parameters, x_fails)
+@common.XfailIfNoCorstone320
+def test_cat_u85_BI(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        Cat(),
+        test_data(),
+        aten_op,
+        exir_op,
+        run_on_fvp=True,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py
index 368f7967433..0846effa7a6 100644
--- a/backends/arm/test/ops/test_clamp.py
+++ b/backends/arm/test/ops/test_clamp.py
@@ -17,20 +17,21 @@
     TosaPipelineMI,
 )
 
-
 aten_op = "torch.ops.aten.clamp.default"
 exir_op = "executorch_exir_dialects_edge__ops_aten_clamp_default"
 
+
 input_t = Tuple[torch.Tensor]
 
+
 test_data_suite = {
     # test_name: (test_data, min, max)
-    "rank_1": (torch.rand(10) * 2, -1.0, 1.0),
-    "rank_2": (torch.rand(1, 35), 0.5, 0.8),
-    "rank_3": (torch.ones(1, 10, 10), -1, -1),
-    "rank_4": (torch.rand(1, 10, 10, 1) * 2, -0.1, 2.0),
-    "rank_4_mixed_min_max_dtype": (torch.rand(1, 10, 10, 5) + 10, 8.0, 10),
-    "rank_4_no_min": (torch.rand(1, 10, 10, 1) * 10, None, 5),
-    "rank_4_no_max": (torch.rand(1, 10, 10, 1) - 3, -3.3, None),
+    "rank_1": lambda: (torch.rand(10) * 2, -1.0, 1.0),
+    "rank_2": lambda: (torch.rand(1, 35), 0.5, 0.8),
+    "rank_3": lambda: (torch.ones(1, 10, 10), -1, -1),
+    "rank_4": lambda: (torch.rand(1, 10, 10, 1) * 2, -0.1, 2.0),
+    "rank_4_mixed_min_max_dtype": lambda: (torch.rand(1, 10, 10, 5) + 10, 8.0, 10),
+    "rank_4_no_min": lambda: (torch.rand(1, 10, 10, 1) * 10, None, 5),
+    "rank_4_no_max": lambda: (torch.rand(1, 10, 10, 1) - 3, -3.3, None),
 }
 
 
@@ -52,7 +53,7 @@ def forward(self, x):
 
 @common.parametrize("test_data", test_data_suite)
 def test_clamp_tosa_MI(test_data):
-    input_tensor, min_val, max_val = test_data
+    
input_tensor, min_val, max_val = test_data() model = Clamp(min_val, max_val) pipeline = TosaPipelineMI[input_t]( @@ -68,7 +69,7 @@ def test_clamp_tosa_MI(test_data): @common.parametrize("test_data", test_data_suite) def test_clamp_tosa_BI(test_data): - input_tensor, min_val, max_val = test_data + input_tensor, min_val, max_val = test_data() model = Clamp(min_val, max_val) pipeline = TosaPipelineBI[input_t]( @@ -84,46 +85,10 @@ def test_clamp_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_clamp_u55_BI(test_data): - input_tensor, min_val, max_val = test_data - model = Clamp(min_val, max_val) - - pipeline = EthosU55PipelineBI[input_t]( - model, - (input_tensor,), - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_data", test_data_suite) -def test_clamp_u85_BI(test_data): - - input_tensor, min_val, max_val = test_data - model = Clamp(min_val, max_val) - - pipeline = EthosU85PipelineBI[input_t]( - model, - (input_tensor,), - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_data", test_data_suite) -@common.SkipIfNoCorstone300 -def test_clamp_u55_BI_on_fvp(test_data): - - input_tensor, min_val, max_val = test_data + input_tensor, min_val, max_val = test_data() model = Clamp(min_val, max_val) pipeline = EthosU55PipelineBI[input_t]( @@ -140,10 +105,10 @@ def test_clamp_u55_BI_on_fvp(test_data): @common.parametrize("test_data", test_data_suite) -@common.SkipIfNoCorstone320 -def test_clamp_u85_BI_on_fvp(test_data): +@common.XfailIfNoCorstone320 +def test_clamp_u85_BI(test_data): - input_tensor, min_val, max_val = test_data + input_tensor, min_val, max_val = test_data() model = Clamp(min_val, max_val) pipeline = EthosU85PipelineBI[input_t]( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 2aad62ece24..125a705ccb4 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -21,7 +21,6 @@ TosaPipelineMI, ) - aten_op = "torch.ops.aten.clone.default" exir_op = "executorch_exir_dialects_edge__ops_aten_clone_default" @@ -36,13 +35,13 @@ def forward(self, x: torch.Tensor): test_data_suite = { - "ones_1D_10": (torch.ones(10),), - "ones_1D_50": (torch.ones(50),), - "rand_1D_20": (torch.rand(20),), - "rand_2D_10x10": (torch.rand(10, 10),), - "rand_3D_5x5x5": (torch.rand(5, 5, 5),), - "rand_4D_2x3x4x5": (torch.rand(2, 3, 4, 5),), - "large_tensor": (torch.rand(1000),), + "ones_1D_10": lambda: (torch.ones(10),), + "ones_1D_50": lambda: (torch.ones(50),), + "rand_1D_20": lambda: (torch.rand(20),), + "rand_2D_10x10": lambda: (torch.rand(10, 10),), + "rand_3D_5x5x5": lambda: (torch.rand(5, 5, 5),), + "rand_4D_2x3x4x5": lambda: (torch.rand(2, 3, 4, 5),), + "large_tensor": lambda: (torch.rand(1000),), } @@ -51,7 +50,7 @@ def test_clone_tosa_MI(test_data: Tuple[torch.Tensor]): pipeline = TosaPipelineMI[input_t]( Clone(), - test_data, + test_data(), aten_op, exir_op, ) @@ -63,7 +62,7 @@ def test_clone_tosa_MI(test_data: Tuple[torch.Tensor]): def test_clone_tosa_BI(test_data): pipeline = TosaPipelineBI[input_t]( Clone(), - test_data, + test_data(), aten_op, exir_op, symmetric_io_quantization=True, @@ -72,48 +71,14 @@ def test_clone_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 @pytest.mark.xfail( reason="Empty subgraph leads to Vela compilation failure. 
See: https://jira.arm.com/browse/MLBEDSW-10477" ) def test_clone_u55_BI(test_data): pipeline = EthosU55PipelineBI[input_t]( Clone(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" -) -def test_clone_u85_BI(test_data): - pipeline = EthosU85PipelineBI[input_t]( - Clone(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - symmetric_io_quantization=True, - ) - - pipeline.run() - - -@common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" -) -@common.SkipIfNoCorstone300 -def test_clone_u55_BI_on_fvp(test_data): - pipeline = EthosU55PipelineBI[input_t]( - Clone(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, @@ -124,14 +89,14 @@ def test_clone_u55_BI_on_fvp(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 @pytest.mark.xfail( reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" ) -@common.SkipIfNoCorstone320 -def test_clone_u85_BI_on_fvp(test_data): +def test_clone_u85_BI(test_data): pipeline = EthosU85PipelineBI[input_t]( Clone(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_constant_pad_nd.py b/backends/arm/test/ops/test_constant_pad_nd.py index 9a19f6fbf5f..0a81fd0f97d 100644 --- a/backends/arm/test/ops/test_constant_pad_nd.py +++ b/backends/arm/test/ops/test_constant_pad_nd.py @@ -17,19 +17,20 @@ aten_op = "torch.ops.aten.pad.default" exir_op = "executorch_exir_dialects_edge__ops_aten_pad_default" + input_t1 = Tuple[torch.Tensor] # Input x + test_data_suite = { - "4dim_last1dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 0, 0, 0, 0, 0), 1), - "4dim_last2dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 0, 0, 0, 0, 0), 2), - "4dim_last3dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 2, 0, 2, 0, 0), 3), - "4dim_last4dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 1, 0, 2, 0, 2), 4), - "3dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), - "3dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), - "3dim_last3dim": (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), - "2dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), - "2dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), + "4dim_last1dim": lambda: (torch.rand(1, 1, 16, 16), (1, 1, 0, 0, 0, 0, 0, 0), 1), + "4dim_last2dim": lambda: (torch.rand(1, 1, 16, 16), (1, 0, 1, 0, 0, 0, 0, 0), 2), + "4dim_last3dim": lambda: (torch.rand(1, 1, 16, 16), (1, 1, 0, 2, 0, 2, 0, 0), 3), + "4dim_last4dim": lambda: (torch.rand(1, 1, 16, 16), (1, 0, 1, 1, 0, 2, 0, 2), 4), + "3dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), + "3dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), + "3dim_last3dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), + "2dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), + "2dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), } -"""Tests pad.""" class ConstantPadND(torch.nn.Module): @@ -53,7 +54,7 @@ def forward(self, x: torch.Tensor): test_data_suite, ) def test_constant_pad_nd_tosa_MI(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineMI[input_t1]( 
ConstantPadND(padding, value), (test_data,), @@ -65,7 +66,7 @@ def test_constant_pad_nd_tosa_MI(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_BI(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineBI[input_t1]( ConstantPadND(padding, value), (test_data,), diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index a1ba23ac73a..768da4d5c89 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -250,27 +250,27 @@ def forward(self, x): ) test_modules = { - "2_3x2x40_nobias": conv1d_2_3x2x40_nobias, - "3_1x3x256_st1": conv1d_3_1x3x256_st1, - "3_1x3x12_st2_pd1": conv1d_3_1x3x12_st2_pd1, - "1_1x2x128_st1": conv1d_1_1x2x128_st1, - "2_1x2x14_st2": conv1d_2_1x2x14_st2, - "5_3x2x128_st1": conv1d_5_3x2x128_st1, - "3_1x3x224_st2_pd1": conv1d_3_1x3x224_st2_pd1, - "7_1x3x16_st2_pd1_dl2_needs_adjust_pass": conv1d_7_1x3x16_st2_pd1_dl2, - "7_1x3x15_st1_pd0_dl1_needs_adjust_pass": conv1d_7_1x3x15_st1_pd0_dl1, - "5_1x3x14_st5_pd0_dl1_needs_adjust_pass": conv1d_5_1x3x14_st5_pd0_dl1, - "5_1x3x9_st5_pd0_dl1_needs_adjust_pass": conv1d_5_1x3x9_st5_pd0_dl1, - "two_conv1d_nobias": two_conv1d_nobias, - "two_conv1d": two_conv1d, + "2_3x2x40_nobias": lambda: conv1d_2_3x2x40_nobias, + "3_1x3x256_st1": lambda: conv1d_3_1x3x256_st1, + "3_1x3x12_st2_pd1": lambda: conv1d_3_1x3x12_st2_pd1, + "1_1x2x128_st1": lambda: conv1d_1_1x2x128_st1, + "2_1x2x14_st2": lambda: conv1d_2_1x2x14_st2, + "5_3x2x128_st1": lambda: conv1d_5_3x2x128_st1, + "3_1x3x224_st2_pd1": lambda: conv1d_3_1x3x224_st2_pd1, + "7_1x3x16_st2_pd1_dl2_needs_adjust_pass": lambda: conv1d_7_1x3x16_st2_pd1_dl2, + "7_1x3x15_st1_pd0_dl1_needs_adjust_pass": lambda: conv1d_7_1x3x15_st1_pd0_dl1, + "5_1x3x14_st5_pd0_dl1_needs_adjust_pass": lambda: conv1d_5_1x3x14_st5_pd0_dl1, + "5_1x3x9_st5_pd0_dl1_needs_adjust_pass": lambda: conv1d_5_1x3x9_st5_pd0_dl1, + "two_conv1d_nobias": lambda: two_conv1d_nobias, + "two_conv1d": lambda: two_conv1d, } @common.parametrize("test_module", test_modules) def test_convolution_1d_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), aten_op, exir_op, ) @@ -280,8 +280,8 @@ def test_convolution_1d_tosa_MI(test_module): @common.parametrize("test_module", test_modules) def test_convolution_1d_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), aten_op, exir_op, ) @@ -290,35 +290,11 @@ def test_convolution_1d_tosa_BI(test_module): @common.parametrize("test_module", test_modules) +@common.XfailIfNoCorstone300 def test_convolution_1d_u55_BI(test_module): pipeline = EthosU55PipelineBI[input_t]( - test_module, - test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - -@common.parametrize("test_module", test_modules) -def test_convolution_1d_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - -@common.parametrize("test_module", test_modules) -@common.SkipIfNoCorstone300 -def test_convolution_1d_u55_BI_on_fvp(test_module): - pipeline = EthosU55PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), aten_op, exir_op, run_on_fvp=True, @@ -328,11 +304,11 @@ def 
test_convolution_1d_u55_BI_on_fvp(test_module): @common.parametrize("test_module", test_modules) -@common.SkipIfNoCorstone320 -def test_convolution_1d_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_convolution_1d_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), aten_op, exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 844eed97638..158c296e4ec 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -330,24 +330,24 @@ def forward(self, x): # Shenanigan to get a nicer output when test fails. With unittest it looks like: # FAIL: test_convolution_2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1 test_modules = { - "2x2_3x2x40x40_nobias": conv2d_2x2_3x2x40x40_nobias, - "3x3_1x3x256x256_st1": conv2d_3x3_1x3x256x256_st1, - "3x3_1x3x12x12_st2_pd1": conv2d_3x3_1x3x12x12_st2_pd1, - "1x1_1x2x128x128_st1": conv2d_1x1_1x2x128x128_st1, - "2x2_1x1x14x13_st2_needs_adjust_pass": conv2d_2x2_1x1x14x13_st2, - "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": conv2d_5x5_1x3x14x15_st3_pd1, - "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": conv2d_7x7_1x3x16x16_st2_pd1_dl2, - "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": conv2d_7x7_1x3x15x15_st1_pd0_dl1, - "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": conv2d_5x5_1x3x14x14_st5_pd0_dl1, - "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": conv2d_5x5_1x3x9x9_st5_pd0_dl1, - "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": conv2d_3x3_1x3x9x8_st3_pd0_dl1, - "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": conv2d_3x3_1x3x8x9_st3_pd0_dl1, - "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv2d_3x4_1x3x7x7_st3_pd0_dl1, - "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv2d_4x3_1x3x7x7_st3_pd0_dl1, - "5x5_3x2x128x128_st1": conv2d_5x5_3x2x128x128_st1, - "3x3_1x3x224x224_st2_pd1": conv2d_3x3_1x3x224x224_st2_pd1, - "two_conv2d_nobias": two_conv2d_nobias, - "two_conv2d": two_conv2d, + "2x2_3x2x40x40_nobias": lambda: conv2d_2x2_3x2x40x40_nobias, + "3x3_1x3x256x256_st1": lambda: conv2d_3x3_1x3x256x256_st1, + "3x3_1x3x12x12_st2_pd1": lambda: conv2d_3x3_1x3x12x12_st2_pd1, + "1x1_1x2x128x128_st1": lambda: conv2d_1x1_1x2x128x128_st1, + "2x2_1x1x14x13_st2_needs_adjust_pass": lambda: conv2d_2x2_1x1x14x13_st2, + "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": lambda: conv2d_5x5_1x3x14x15_st3_pd1, + "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": lambda: conv2d_7x7_1x3x16x16_st2_pd1_dl2, + "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": lambda: conv2d_7x7_1x3x15x15_st1_pd0_dl1, + "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": lambda: conv2d_5x5_1x3x14x14_st5_pd0_dl1, + "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": lambda: conv2d_5x5_1x3x9x9_st5_pd0_dl1, + "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": lambda: conv2d_3x3_1x3x9x8_st3_pd0_dl1, + "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": lambda: conv2d_3x3_1x3x8x9_st3_pd0_dl1, + "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": lambda: conv2d_3x4_1x3x7x7_st3_pd0_dl1, + "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": lambda: conv2d_4x3_1x3x7x7_st3_pd0_dl1, + "5x5_3x2x128x128_st1": lambda: conv2d_5x5_3x2x128x128_st1, + "3x3_1x3x224x224_st2_pd1": lambda: conv2d_3x3_1x3x224x224_st2_pd1, + "two_conv2d_nobias": lambda: two_conv2d_nobias, + "two_conv2d": lambda: two_conv2d, } fvp_xfails = { @@ -360,7 +360,10 @@ def forward(self, x): @common.parametrize("test_module", test_modules) def test_convolution_2d_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - 
test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, ) pipeline.run() @@ -368,48 +371,43 @@ def test_convolution_2d_tosa_MI(test_module): @common.parametrize("test_module", test_modules) def test_convolution_2d_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() -@common.parametrize("test_module", test_modules) -def test_convolution_2d_u55_BI(test_module): - pipeline = EthosU55PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False - ) - pipeline.run() - - -@common.parametrize("test_module", test_modules) -def test_convolution_2d_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False - ) - pipeline.run() - - @common.parametrize("test_module", test_modules, fvp_xfails) -@common.SkipIfNoCorstone300 -def test_convolution_2d_u55_BI_on_fvp(test_module): +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI(test_module): pipeline = EthosU55PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_module", test_modules, fvp_xfails) -@common.SkipIfNoCorstone320 -def test_convolution_2d_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, ) pipeline.run() reject_suite = { - "large_stride": Conv2d( + "large_stride": lambda: Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 4), @@ -419,7 +417,7 @@ def test_convolution_2d_u85_BI_on_fvp(test_module): height=14, batches=1, ), - "large_kernel_height": Conv2d( + "large_kernel_height": lambda: Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 65), @@ -429,7 +427,7 @@ def test_convolution_2d_u85_BI_on_fvp(test_module): height=70, batches=1, ), - "large_kernel": Conv2d( + "large_kernel": lambda: Conv2d( in_channels=1, out_channels=1, kernel_size=(70, 60), @@ -443,12 +441,11 @@ def test_convolution_2d_u85_BI_on_fvp(test_module): @common.parametrize("module", reject_suite) -def test_reject_convolution_2d_u55_BI( - module: Conv2d, -): +def test_convolution_2d_u55_BI_not_delegated(module: Conv2d): OpNotSupportedPipeline( - module, - module.get_inputs(), - "TOSA-0.80+BI+u55", + module(), + module().get_inputs(), {"executorch_exir_dialects_edge__ops_aten_convolution_default": 1}, + quantize=True, + u55_subset=True, ).run() diff --git a/backends/arm/test/ops/test_conv3d.py b/backends/arm/test/ops/test_conv3d.py index 22f7e9e7f54..c7bb7c55887 100644 --- a/backends/arm/test/ops/test_conv3d.py +++ b/backends/arm/test/ops/test_conv3d.py @@ -305,22 +305,22 @@ def forward(self, x): ) test_modules = { - "2x2_3x2x40x40_nobias": conv3d_2x2_3x2x40x40_nobias, - "3x3_1x3x256x256_st1": conv3d_3x3_1x3x256x256_st1, - "3x3_1x3x12x12_st2_pd1": conv3d_3x3_1x3x12x12_st2_pd1, - "1x1_1x2x128x128_st1": conv3d_1x1_1x2x128x128_st1, - "2x2_1x1x14x13_st2_needs_adjust_pass": conv3d_2x2_1x1x14x13_st2, - "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": 
conv3d_5x5_1x3x14x15_st3_pd1, - "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": conv3d_7x7_1x3x16x16_st2_pd1_dl2, - "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": conv3d_7x7_1x3x15x15_st1_pd0_dl1, - "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": conv3d_5x5_1x3x14x14_st5_pd0_dl1, - "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": conv3d_5x5_1x3x9x9_st5_pd0_dl1, - "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": conv3d_3x3_1x3x9x8_st3_pd0_dl1, - "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": conv3d_3x3_1x3x8x9_st3_pd0_dl1, - "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv3d_3x4_1x3x7x7_st3_pd0_dl1, - "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv3d_4x3_1x3x7x7_st3_pd0_dl1, - "5x5_3x2x128x128_st1": conv3d_5x5_3x2x128x128_st1, - "3x3_1x3x224x224_st2_pd1": conv3d_3x3_1x3x224x224_st2_pd1, + "2x2_3x2x40x40_nobias": lambda: conv3d_2x2_3x2x40x40_nobias, + "3x3_1x3x256x256_st1": lambda: conv3d_3x3_1x3x256x256_st1, + "3x3_1x3x12x12_st2_pd1": lambda: conv3d_3x3_1x3x12x12_st2_pd1, + "1x1_1x2x128x128_st1": lambda: conv3d_1x1_1x2x128x128_st1, + "2x2_1x1x14x13_st2_needs_adjust_pass": lambda: conv3d_2x2_1x1x14x13_st2, + "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": lambda: conv3d_5x5_1x3x14x15_st3_pd1, + "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": lambda: conv3d_7x7_1x3x16x16_st2_pd1_dl2, + "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": lambda: conv3d_7x7_1x3x15x15_st1_pd0_dl1, + "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": lambda: conv3d_5x5_1x3x14x14_st5_pd0_dl1, + "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": lambda: conv3d_5x5_1x3x9x9_st5_pd0_dl1, + "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": lambda: conv3d_3x3_1x3x9x8_st3_pd0_dl1, + "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": lambda: conv3d_3x3_1x3x8x9_st3_pd0_dl1, + "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": lambda: conv3d_3x4_1x3x7x7_st3_pd0_dl1, + "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": lambda: conv3d_4x3_1x3x7x7_st3_pd0_dl1, + "5x5_3x2x128x128_st1": lambda: conv3d_5x5_3x2x128x128_st1, + "3x3_1x3x224x224_st2_pd1": lambda: conv3d_3x3_1x3x224x224_st2_pd1, } input_t = Tuple[torch.Tensor] @@ -328,18 +328,18 @@ def forward(self, x): @common.parametrize("test_module", test_modules) @pytest.mark.skip # Not implemented, skip until it is. -def test_convolution_3d_tosa_MI(test_module): +def test_convolution_tosa_MI_3d(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), test_module().get_inputs(), aten_op, exir_op ) pipeline.run() @common.parametrize("test_module", test_modules) @pytest.mark.skip # Not implemented, skip until it is. -def test_convolution_3d_tosa_BI(test_module): +def test_convolution_tosa_BI_3d(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), test_module().get_inputs(), aten_op, exir_op ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() @@ -347,24 +347,32 @@ def test_convolution_3d_tosa_BI(test_module): @common.parametrize("test_module", test_modules) @pytest.mark.skip # Not implemented, skip until it is. -def test_convolution_3d_u55_BI(test_module): +def test_convolution_u55_BI_3d(test_module): pipeline = EthosU55PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_module", test_modules) @pytest.mark.skip # Not implemented, skip until it is. 
-def test_convolution_3d_u85_BI(test_module): +def test_convolution_u85_BI_3d(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, ) pipeline.run() reject_suite = { - "large_stride": Conv3d( + "large_stride": lambda: Conv3d( in_channels=1, out_channels=1, kernel_size=(2, 2, 1), @@ -374,7 +382,7 @@ def test_convolution_3d_u85_BI(test_module): height=14, batches=1, ), - "large_kernel_z": Conv3d( + "large_kernel_z": lambda: Conv3d( in_channels=1, out_channels=1, kernel_size=(2, 2, 2), @@ -388,12 +396,11 @@ def test_convolution_3d_u85_BI(test_module): @common.parametrize("module", reject_suite) -def test_reject_convolution_3d_u55_BI( - module: Conv3d, -): +def test_convolution_u55_BI_not_delegated_3d(module: Conv3d): OpNotSupportedPipeline( - module, - module.get_inputs(), - "TOSA-0.80+BI+u55", + module(), + module().get_inputs(), {"executorch_exir_dialects_edge__ops_aten_convolution_default": 1}, + quantize=True, + u55_subset=True, ).run() diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 0fb3c2675e9..7f54fa226aa 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -1,20 +1,24 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest - from typing import Tuple import pytest import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] + from torch.nn.parameter import Parameter @@ -138,13 +142,13 @@ class ComboConvRelu6(torch.nn.Module): "executorch_exir_dialects_edge__ops_aten_hardtanh_default", ] - test_data = [ - (2 * torch.randn(1, 3, 256, 256),), - (0.5 * torch.randn(1, 3, 256, 256),), - (torch.randn(1, 3, 256, 256),), - (-0.5 * torch.randn(1, 3, 256, 256),), - (-2 * torch.randn(1, 3, 256, 256),), - ] + test_data = { + "combo_conv_relu_2_x_4d": lambda: (2 * torch.randn(1, 3, 256, 256),), + "combo_conv_relu_0_5_x_4d": lambda: (0.5 * torch.randn(1, 3, 256, 256),), + "combo_conv_relu_4d": lambda: (torch.randn(1, 3, 256, 256),), + "combo_conv_relu_neg_0_5_x_4d": lambda: (-0.5 * torch.randn(1, 3, 256, 256),), + "combo_conv_relu_neg_2_x_4d": lambda: (-2 * torch.randn(1, 3, 256, 256),), + } def __init__(self): super().__init__() @@ -165,12 +169,12 @@ class ComboConvAvgPool2d(torch.nn.Module): "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", ] - test_data = [ - (20 * torch.randn(1, 3, 64, 32),), - (torch.randn(1, 3, 100, 200),), - (5 * torch.randn(1, 3, 256, 256),), - (torch.rand(1, 3, 512, 128),), - ] + test_data = { + "combo_conv_avgpool_20_x_4d": lambda: (20 * torch.randn(1, 3, 64, 32),), + "combo_conv_avgpool_4d": lambda: (torch.randn(1, 3, 100, 200),), + "combo_conv_avgpool_5_x_4d_randn": lambda: (5 * torch.randn(1, 3, 256, 256),), + "combo_conv_avgpool_2_x_4d": lambda: (torch.rand(1, 3, 512, 128),), + } def __init__(self): 
super().__init__() @@ -185,238 +189,291 @@ def forward(self, x): return x -class TestConvCombos(unittest.TestCase): - """Tests conv combined with other ops.""" - - def _test_conv_combo_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(list(module.edge_op_list)) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_conv_combo_tosa_BI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - atol: float = 1e-3, - rtol: float = 1e-3, - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(list(module.edge_op_list)) - .to_executorch() - .run_method_and_compare_outputs( - inputs=test_data, atol=atol, rtol=rtol, qtol=1 - ) - ) - - def _test_conv_combo_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(list(module.edge_op_list)) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - #################### - ## Conv + meandim ## - #################### - def test_conv_meandim_tosa_MI(self): - model = ComboConv2dMeandim() - self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - - def test_conv_meandim_tosa_BI(self): - model = ComboConv2dMeandim() - self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - - @pytest.mark.corstone_fvp - def test_conv_meandim_u55_BI(self): - model = ComboConv2dMeandim() - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - model.get_inputs(), - ) - - @pytest.mark.corstone_fvp - def test_conv_meandim_u85_BI(self): - model = ComboConv2dMeandim() - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - model.get_inputs(), - ) - - ############################## - ## Conv + batch norm + relu ## - ############################## - affine_params = [("affine", True), ("_no_affine", False)] - - @parameterized.expand(affine_params) - def test_conv_batchnorm_relu6_tosa_MI(self, test_suffix, affine): - model = ComboConvBatchnormRelu6(affine) - self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - - @parameterized.expand(affine_params) - def test_conv_batchnorm_relu6_tosa_BI(self, test_suffix, affine): - model = ComboConvBatchnormRelu6(affine) - self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - - @parameterized.expand(affine_params) - @pytest.mark.corstone_fvp - def test_conv_batchnorm_relu6_u55_BI(self, test_suffix, affine): - model = ComboConvBatchnormRelu6(affine) - self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u55_compile_spec(), model.get_inputs() - ) - - @parameterized.expand(affine_params) - @pytest.mark.corstone_fvp - def test_conv_batchnorm_relu_u85_BI(self, test_suffix, affine): - model = 
ComboConvBatchnormRelu6(affine) - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - model.get_inputs(), - ) - - ################## - ## Conv + ReLU6 ## - ################## - @parameterized.expand(ComboConvRelu6.test_data) - def test_conv_relu6_tosa_MI(self, test_data: torch.Tensor): - model = ComboConvRelu6() - test_data = (test_data,) - self._test_conv_combo_tosa_MI_pipeline(model, test_data) - - @parameterized.expand(ComboConvRelu6.test_data) - def test_conv_relu6_tosa_BI(self, test_data: torch.Tensor): - model = ComboConvRelu6() - test_data = (test_data,) - self._test_conv_combo_tosa_BI_pipeline(model, test_data) - - @parameterized.expand(ComboConvRelu6.test_data) - @pytest.mark.corstone_fvp - def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): - model = ComboConvRelu6() - test_data = (test_data,) - self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u55_compile_spec(), test_data - ) - - @parameterized.expand(ComboConvRelu6.test_data) - @pytest.mark.corstone_fvp - def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): - model = ComboConvRelu6() - test_data = (test_data,) - self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u85_compile_spec(), test_data - ) - - ############################### - ## Block bottleneck residual ## - ############################### - def test_block_bottleneck_residual_tosa_MI(self): - model = ComboBlockBottleneckResidual() - self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - - @pytest.mark.flaky # TODO: Investigate flakyness (MLTORCH-307) - def test_block_bottleneck_residual_tosa_BI(self): - model = ComboBlockBottleneckResidual() - self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - - @pytest.mark.corstone_fvp - def test_block_bottleneck_residual_u55_BI(self): - model = ComboBlockBottleneckResidual() - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - model.get_inputs(), - ) - - @pytest.mark.corstone_fvp - def test_block_bottleneck_residual_u85_BI(self): - model = ComboBlockBottleneckResidual() - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - model.get_inputs(), - ) - - ###################### - ## Conv + AvgPool2d ## - ###################### - @parameterized.expand(ComboConvAvgPool2d.test_data) - def test_conv_avgpool2d_tosa_MI(self, test_data: torch.Tensor): - model = ComboConvAvgPool2d() - test_data = (test_data,) - self._test_conv_combo_tosa_MI_pipeline(model, test_data) - - @parameterized.expand(ComboConvAvgPool2d.test_data) - def test_conv_avgpool2d_tosa_BI(self, test_data: torch.Tensor): - model = ComboConvAvgPool2d() - test_data = (test_data,) - self._test_conv_combo_tosa_BI_pipeline(model, test_data) - - @parameterized.expand(ComboConvAvgPool2d.test_data) - @pytest.mark.corstone_fvp - def test_conv_avgpool2d_u55_BI(self, test_data: torch.Tensor): - model = ComboConvAvgPool2d() - test_data = (test_data,) - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - test_data, - ) - - @parameterized.expand(ComboConvAvgPool2d.test_data) - @pytest.mark.corstone_fvp - def test_conv_avgpool2d_u85_BI(self, test_data: torch.Tensor): - model = ComboConvAvgPool2d() - test_data = (test_data,) - self._test_conv_combo_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - test_data, - ) +#################### +## Conv + meandim ## +#################### + + +def test_convolution_2d_tosa_MI_meandim(): + model = ComboConv2dMeandim() + + pipeline = 
TosaPipelineMI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboConv2dMeandim.edge_op_list, + ) + pipeline.run() + + +def test_convolution_2d_tosa_BI_meandim(): + model = ComboConv2dMeandim() + pipeline = TosaPipelineBI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboConv2dMeandim.edge_op_list, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI_meandim(): + model = ComboConv2dMeandim() + pipeline = EthosU55PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=ComboConv2dMeandim.edge_op_list, + run_on_fvp=True, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI_meandim(): + model = ComboConv2dMeandim() + pipeline = EthosU85PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=ComboConv2dMeandim.edge_op_list, + run_on_fvp=True, + ) + pipeline.run() + + +############################## +## Conv + batch norm + relu ## +############################## +affine_params = {"affine": True, "_no_affine": False} + + +@common.parametrize("affine", affine_params) +def test_convolution_2d_tosa_MI_batchnorm_relu6(affine): + model = ComboConvBatchnormRelu6(affine) + pipeline = TosaPipelineMI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboConvBatchnormRelu6.edge_op_list, + ) + pipeline.run() + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) +@common.parametrize("affine", affine_params) +def test_convolution_2d_tosa_BI_batchnorm_relu6(affine): + model = ComboConvBatchnormRelu6(affine) + pipeline = TosaPipelineBI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboConvBatchnormRelu6.edge_op_list, + ) + pipeline.run() + + +@common.parametrize("affine", affine_params) +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI_batchnorm_relu6(affine): + model = ComboConvBatchnormRelu6(affine) + pipeline = EthosU55PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("affine", affine_params) +@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI_batchnorm_relu6(affine): + model = ComboConvBatchnormRelu6(affine) + pipeline = EthosU85PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +################## +## Conv + ReLU6 ## +################## + + +@common.parametrize("test_data", ComboConvRelu6.test_data) +def test_convolution_2d_tosa_MI_relu6(test_data: torch.Tensor): + model = ComboConvRelu6() + pipeline = TosaPipelineMI[input_t1]( + model, + test_data(), + aten_op=[], + exir_op=ComboConvRelu6.edge_op_list, + ) + pipeline.run() + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) +@common.parametrize("test_data", ComboConvRelu6.test_data) +def test_convolution_2d_tosa_BI_relu6(test_data: torch.Tensor): + model = ComboConvRelu6() + pipeline = TosaPipelineBI[input_t1]( + model, + test_data(), + aten_op=[], + exir_op=ComboConvRelu6.edge_op_list, + ) + pipeline.run() + + +@common.parametrize("test_data", ComboConvRelu6.test_data) +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI_relu6(test_data: torch.Tensor): + model = ComboConvRelu6() + pipeline = EthosU55PipelineBI[input_t1]( + model, + test_data(), + aten_ops=[], + exir_ops=ComboConvRelu6.edge_op_list, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", ComboConvRelu6.test_data) 
+@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI_relu6(test_data: torch.Tensor): + model = ComboConvRelu6() + pipeline = EthosU85PipelineBI[input_t1]( + model, + test_data(), + aten_ops=[], + exir_ops=ComboConvRelu6.edge_op_list, + run_on_fvp=True, + ) + pipeline.run() + + +############################### +## Block bottleneck residual ## +############################### +def test_convolution_2d_tosa_MI_block_bottleneck(): + model = ComboBlockBottleneckResidual() + pipeline = TosaPipelineMI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboBlockBottleneckResidual.edge_op_list, + ) + pipeline.run() + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) +def test_convolution_2d_tosa_BI_block_bottleneck(): + model = ComboBlockBottleneckResidual() + pipeline = TosaPipelineBI[input_t1]( + model, + model.get_inputs(), + aten_op=[], + exir_op=ComboBlockBottleneckResidual.edge_op_list, + ) + pipeline.change_args("run_method_and_compare_outputs", model.get_inputs(), qtol=1) + pipeline.run() + + +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI_block_bottleneck(): + model = ComboBlockBottleneckResidual() + pipeline = EthosU55PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI_block_bottleneck(): + model = ComboBlockBottleneckResidual() + pipeline = EthosU85PipelineBI[input_t1]( + model, + model.get_inputs(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +###################### +## Conv + AvgPool2d ## +###################### + + +@common.parametrize("test_data", ComboConvAvgPool2d.test_data) +def test_convolution_2d_tosa_MI_avgpool2d(test_data: torch.Tensor): + model = ComboConvAvgPool2d() + pipeline = TosaPipelineMI[input_t1]( + model, + test_data(), + aten_op=[], + exir_op=ComboConvAvgPool2d.edge_op_list, + ) + pipeline.run() + + +x_fails = { + "combo_conv_avgpool_20_x_4d": "AssertionError: Output 0 does not match reference output.", + "combo_conv_avgpool_4d": "AssertionError: Output 0 does not match reference output.", + "combo_conv_avgpool_5_x_4d_randn": "AssertionError: Output 0 does not match reference output.", + "combo_conv_avgpool_2_x_4d": "AssertionError: Output 0 does not match reference output.", +} + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) +@common.parametrize("test_data", ComboConvAvgPool2d.test_data, x_fails) +def test_convolution_2d_tosa_BI_avgpool2d(test_data: torch.Tensor): + model = ComboConvAvgPool2d() + pipeline = TosaPipelineBI[input_t1]( + model, + test_data(), + aten_op=[], + exir_op=ComboConvAvgPool2d.edge_op_list, + ) + pipeline.run() + + +@common.parametrize("test_data", ComboConvAvgPool2d.test_data) +@common.XfailIfNoCorstone300 +def test_convolution_2d_u55_BI_avgpool2d(test_data: torch.Tensor): + model = ComboConvAvgPool2d() + pipeline = EthosU55PipelineBI[input_t1]( + model, + test_data(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", ComboConvAvgPool2d.test_data) +@common.XfailIfNoCorstone320 +def test_convolution_2d_u85_BI_avgpool2d(test_data: torch.Tensor): + model = ComboConvAvgPool2d() + pipeline = EthosU85PipelineBI[input_t1]( + model, + test_data(), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 
59ce628693c..91b3dde1bb2 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -1,24 +1,29 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest - from typing import Tuple import pytest import torch -from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t = Tuple[torch.Tensor] # Input x + +exir_op = "executorch_exir_dialects_edge__ops_aten_convolution_default" + from executorch.backends.arm.test.ops.test_conv1d import Conv1d from executorch.backends.arm.test.ops.test_conv2d import Conv2d -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - """ The configuration when @@ -149,158 +154,93 @@ ) # Shenanigan to get a nicer output when test fails. -testsuite_conv2d = [ - ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), - ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), - ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), - ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), - ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), - ("two_dw_conv2d", two_dw_conv2d), -] - -testsuite_conv2d_u85 = [ - ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), - ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), - ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), - ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), -] - -testsuite_conv2d_u85_xfails = [ - ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), - ("two_dw_conv2d", two_dw_conv2d), -] - - -testsuite_conv1d = [ - ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), - ("two_dw_conv1d", two_dw_conv1d), - ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), - ("3_1x3x14_gp3_st1", dw_conv1d_3_1x3x14_gp3_st1), -] - - -class TestDepthwiseConv(unittest.TestCase): - """Tests Conv1D and Conv2D where groups == in_channels and out_channels = K * in_channels. 
This - is a special case enables depthwise convolution.""" - - def _test_dw_conv_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_dw_conv_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .export() - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_dw_conv_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(testsuite_conv1d + testsuite_conv2d) - def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_tosa_MI_pipeline(model, model.get_inputs()) - - @parameterized.expand(testsuite_conv1d + testsuite_conv2d) - @pytest.mark.flaky # TODO: Investigate flakyness (MLTORCH-307) - def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) - - @parameterized.expand(testsuite_conv2d[:4], skip_on_empty=True) - @pytest.mark.corstone_fvp - def test_dw_conv2d_u55_BI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - model.get_inputs(), - ) - - @parameterized.expand(testsuite_conv2d[4:], skip_on_empty=True) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP # TODO: MLETORCH-516 - def test_dw_conv2d_u55_BI_xfails(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - model.get_inputs(), - ) - - @parameterized.expand(testsuite_conv1d, skip_on_empty=True) - @pytest.mark.corstone_fvp - def test_dw_conv1d_u55_BI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_ethos_BI_pipeline( - model, - common.get_u55_compile_spec(), - model.get_inputs(), - ) - - @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85) - @pytest.mark.corstone_fvp - def test_dw_conv_u85_BI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - model.get_inputs(), - ) - - # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. 
MLETORCH-520 - @parameterized.expand(testsuite_conv2d_u85_xfails) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_dw_conv_u85_BI_xfails(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv_ethos_BI_pipeline( - model, - common.get_u85_compile_spec(), - model.get_inputs(), - ) +testsuite_conv2d = { + "2x2_1x6x4x4_gp6_st1": lambda: dw_conv2d_2x2_1x6x4x4_gp6_st1, + "3x3_1x3x256x256_gp3_st1": lambda: dw_conv2d_3x3_1x3x256x256_gp3_st1, + "3x3_1x4x256x256_gp4_nobias": lambda: dw_conv2d_3x3_1x4x256x256_gp4_nobias, + "3x3_1x4x256x256_gp4_st1": lambda: dw_conv2d_3x3_1x4x256x256_gp4_st1, + "3x3_2x8x198x198_gp8_st3": lambda: dw_conv2d_3x3_2x8x198x198_gp8_st3, + "two_dw_conv2d": lambda: two_dw_conv2d, +} + +testsuite_conv2d_u85 = { + "2x2_1x6x4x4_gp6_st1": lambda: dw_conv2d_2x2_1x6x4x4_gp6_st1, + "3x3_1x3x256x256_gp3_st1": lambda: dw_conv2d_3x3_1x3x256x256_gp3_st1, + "3x3_1x4x256x256_gp4_st1": lambda: dw_conv2d_3x3_1x4x256x256_gp4_st1, + "3x3_1x4x256x256_gp4_nobias": lambda: dw_conv2d_3x3_1x4x256x256_gp4_nobias, +} + +testsuite_conv1d = { + "2_1x6x4_gp6_st1": lambda: dw_conv1d_2_1x6x4_gp6_st1, + "two_dw_conv1d": lambda: two_dw_conv1d, + "3_1x3x256_gp3_st1": lambda: dw_conv1d_3_1x3x256_gp3_st1, + "3_1x3x14_gp3_st1": lambda: dw_conv1d_3_1x3x14_gp3_st1, +} + + +@common.parametrize("test_module", testsuite_conv1d | testsuite_conv2d) +def test_convolution_2d_tosa_MI_depth_wise(test_module: torch.nn.Module): + pipeline = TosaPipelineMI[input_t]( + test_module(), + test_module().get_inputs(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) +@common.parametrize("test_module", testsuite_conv1d | testsuite_conv2d) +def test_convolution_2d_tosa_BI_depth_wise(test_module: torch.nn.Module): + pipeline = TosaPipelineBI[input_t]( + test_module(), + test_module().get_inputs(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +x_fails = { + "3x3_2x8x198x198_gp8_st3": "MLETORCH-516: AssertionError: Output 0 does not match reference output.", + "two_dw_conv2d": "MLETORCH-516: AssertionError: Output 0 does not match reference output.", +} + + +@common.parametrize("test_module", testsuite_conv2d, x_fails) +@common.XfailIfNoCorstone300 # TODO: MLETORCH-516 +def test_convolution_2d_u55_BI_depth_wise(test_module: torch.nn.Module): + pipeline = EthosU55PipelineBI[input_t]( + test_module(), + test_module().get_inputs(), + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=True, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 # TODO: MLETORCH-516 +@common.parametrize("test_module", testsuite_conv1d) +def test_convolution_1d_u55_BI_depth_wise(test_module: torch.nn.Module): + pipeline = EthosU55PipelineBI[input_t]( + test_module(), + test_module().get_inputs(), + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_module", testsuite_conv1d | testsuite_conv2d, x_fails) +@common.XfailIfNoCorstone320 # TODO: MLETORCH-516 +def test_convolution_2d_u85_BI_depth_wise(test_module: torch.nn.Module): + pipeline = EthosU85PipelineBI[input_t]( + test_module(), + test_module().get_inputs(), + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index d200a753ce5..087bdb84a63 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -1,243 +1,131 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
-# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Optional, Tuple, Union -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.div.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_div_Tensor" +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +test_data_suite = { # (test_name, input, other, rounding_mode) See torch.div() for info - ( - "op_div_rank1_ones", - torch.ones(5), - torch.ones(5), - None, - ), - ( - "op_div_rank1_negative_ones", + "op_div_rank1_ones": lambda: (torch.ones(5), torch.ones(5), None), + "op_div_rank1_negative_ones": lambda: ( torch.ones(5) * (-1), torch.ones(5) * (-1), None, ), - ( - "op_div_rank1_rand", + "op_div_rank1_rand": lambda: ( torch.rand(5) * 5, torch.rand(5) * 5, None, ), - ( - "op_div_rank4_ones", + "op_div_rank4_ones": lambda: ( torch.ones(5, 10, 25, 20), torch.ones(5, 10, 25, 20), None, ), - ( - "op_div_rank4_negative_ones", + "op_div_rank4_negative_ones": lambda: ( (-1) * torch.ones(5, 10, 25, 20), torch.ones(5, 10, 25, 20), None, ), - ( - "op_div_rank4_ones_div_negative", + "op_div_rank4_ones_div_negative": lambda: ( torch.ones(5, 10, 25, 20), (-1) * torch.ones(5, 10, 25, 20), None, ), - ( - "op_div_rank4_large_rand", + "op_div_rank4_large_rand": lambda: ( 200 * torch.rand(5, 10, 25, 20), torch.rand(5, 10, 25, 20), None, ), - ( - "op_div_rank4_negative_large_rand", + "op_div_rank4_negative_large_rand": lambda: ( (-200) * torch.rand(5, 10, 25, 20), torch.rand(5, 10, 25, 20), None, ), - ( - "op_div_rank4_large_randn", + "op_div_rank4_large_randn": lambda: ( 200 * torch.randn(5, 10, 25, 20) + 1, torch.rand(5, 10, 25, 20) + 1, None, ), -] - - -class TestDiv(unittest.TestCase): - """Tests division""" - - class Div(torch.nn.Module): - - def forward( - self, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - if rounding_mode is None: - return torch.div(input=input_, other=other_) - else: - return torch.div( - input=input_, other=other_, rounding_mode=rounding_mode - ) - - def _test_div_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.div.Tensor": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_div_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count( - {"torch.ops.aten.reciprocal.default": 1, "torch.ops.aten.mul.Tensor": 1} - ) - 
.check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, atol=1, rtol=0.1) - ) - - def _test_div_ethos_BI_pipeline( - self, module: torch.nn.Module, compile_spec, test_data: Tuple[torch.Tensor] - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) - .quantize() - .export() - .check_count( - {"torch.ops.aten.reciprocal.default": 1, "torch.ops.aten.mul.Tensor": 1} - ) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_div_tosa_MI( - self, - test_name: str, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - test_data = (input_, other_) - self._test_div_tosa_MI_pipeline(self.Div(), test_data) +} - @parameterized.expand(test_data_suite) - def test_div_tosa_BI( - self, - test_name: str, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - test_data = (input_, other_) - self._test_div_tosa_BI_pipeline(self.Div(), test_data) +class Div(torch.nn.Module): - @parameterized.expand(test_data_suite[:3]) - @pytest.mark.corstone_fvp - def test_div_u55_BI( - self, - test_name: str, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - test_data = (input_, other_) - self._test_div_ethos_BI_pipeline( - self.Div(), common.get_u55_compile_spec(), test_data - ) - - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[3:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_div_u55_BI_xfails( - self, - test_name: str, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - test_data = (input_, other_) - self._test_div_ethos_BI_pipeline( - self.Div(), common.get_u55_compile_spec(), test_data - ) - - @parameterized.expand(test_data_suite[:3]) - @pytest.mark.corstone_fvp - def test_div_u85_BI( - self, - test_name: str, - input_: Union[torch.Tensor, torch.types.Number], - other_: Union[torch.Tensor, torch.types.Number], - rounding_mode: Optional[str] = None, - ): - test_data = (input_, other_) - self._test_div_ethos_BI_pipeline( - self.Div(), common.get_u85_compile_spec(), test_data - ) - - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[3:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_div_u85_BI_xfails( + def forward( self, - test_name: str, input_: Union[torch.Tensor, torch.types.Number], other_: Union[torch.Tensor, torch.types.Number], rounding_mode: Optional[str] = None, ): - test_data = (input_, other_) - self._test_div_ethos_BI_pipeline( - self.Div(), common.get_u85_compile_spec(), test_data - ) + if rounding_mode is None: + return torch.div(input=input_, other=other_) + else: + return torch.div(input=input_, other=other_, rounding_mode=rounding_mode) + + 
+@common.parametrize("test_data", test_data_suite)
+def test_div_tensor_tosa_MI(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](Div(), test_data(), aten_op, exir_op)
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_div_tensor_tosa_BI(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](Div(), test_data(), aten_op=[], exir_op=[])
+    pipeline.run()
+
+
+x_fails = {
+    "op_div_rank4_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+    "op_div_rank4_negative_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+    "op_div_rank4_ones_div_negative": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+    "op_div_rank4_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+    "op_div_rank4_negative_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+    "op_div_rank4_large_randn": "MLETORCH-521: Numerical issues on FVP likely due to mul op",
+}
+
+
+@common.parametrize("test_data", test_data_suite, xfails=x_fails)
+@common.XfailIfNoCorstone300
+def test_div_tensor_u55_BI(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        Div(),
+        test_data(),
+        aten_ops=[],
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite, xfails=x_fails)
+@common.XfailIfNoCorstone320
+def test_div_tensor_u85_BI(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        Div(),
+        test_data(),
+        aten_ops=[],
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
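A note on the pattern the test_div.py hunks above adopt, which recurs throughout this series: test inputs move from eagerly built tuples into dict entries holding zero-argument lambdas, and each test calls the entry to materialize its tensors. The sketch below is illustrative only; demo_suite and resolve are invented names for the example, not part of the patch:

    import torch

    # Minimal model of the new suite layout: names map to zero-argument lambdas.
    demo_suite = {
        "op_div_rank1_ones": lambda: (torch.ones(5), torch.ones(5), None),
    }

    def resolve(name: str):
        # Calling the entry builds (input, other, rounding_mode) at test run
        # time, so test collection neither allocates tensors nor advances the
        # global RNG state.
        return demo_suite[name]()

    input_, other_, rounding_mode = resolve("op_div_rank1_ones")
    assert torch.equal(torch.div(input_, other_), torch.ones(5))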
diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py
index e3bcf877ffe..bd6cace00a5 100644
--- a/backends/arm/test/ops/test_eq.py
+++ b/backends/arm/test/ops/test_eq.py
@@ -15,7 +15,6 @@
     TosaPipelineMI,
 )
 
-
 input_t = Tuple[torch.Tensor]
 
 
@@ -63,24 +62,27 @@ def get_inputs(self):
 op_eq_scalar_rank4_randn = Equal(torch.randn(3, 2, 2, 2), 0.3)
 
 test_data_tensor = {
-    "eq_tensor_rank1_ones": op_eq_tensor_rank1_ones,
-    "eq_tensor_rank2_rand": op_eq_tensor_rank2_rand,
-    "eq_tensor_rank3_randn": op_eq_tensor_rank3_randn,
-    "eq_tensor_rank4_randn": op_eq_tensor_rank4_randn,
+    "eq_tensor_rank1_ones": lambda: op_eq_tensor_rank1_ones,
+    "eq_tensor_rank2_rand": lambda: op_eq_tensor_rank2_rand,
+    "eq_tensor_rank3_randn": lambda: op_eq_tensor_rank3_randn,
+    "eq_tensor_rank4_randn": lambda: op_eq_tensor_rank4_randn,
 }
 
 test_data_scalar = {
-    "eq_scalar_rank1_ones": op_eq_scalar_rank1_ones,
-    "eq_scalar_rank2_rand": op_eq_scalar_rank2_rand,
-    "eq_scalar_rank3_randn": op_eq_scalar_rank3_randn,
-    "eq_scalar_rank4_randn": op_eq_scalar_rank4_randn,
+    "eq_scalar_rank1_ones": lambda: op_eq_scalar_rank1_ones,
+    "eq_scalar_rank2_rand": lambda: op_eq_scalar_rank2_rand,
+    "eq_scalar_rank3_randn": lambda: op_eq_scalar_rank3_randn,
+    "eq_scalar_rank4_randn": lambda: op_eq_scalar_rank4_randn,
 }
 
 
 @common.parametrize("test_module", test_data_tensor)
-def test_eq_tensor_tosa_MI(test_module):
+def test_eq_scalar_tosa_MI_tensor(test_module):
     pipeline = TosaPipelineMI[input_t](
-        test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op
+        test_module(),
+        test_module().get_inputs(),
+        Equal.aten_op_Tensor,
+        Equal.exir_op,
     )
     pipeline.run()
 
@@ -88,8 +90,8 @@ def test_eq_tensor_tosa_MI(test_module):
 @common.parametrize("test_module", test_data_scalar)
 def test_eq_scalar_tosa_MI(test_module):
     pipeline = TosaPipelineMI[input_t](
-        test_module,
-        test_module.get_inputs(),
+        test_module(),
+        test_module().get_inputs(),
         Equal.aten_op_Scalar,
         Equal.exir_op,
     )
@@ -97,9 +99,12 @@ def test_eq_scalar_tosa_MI(test_module):
 
 
 @common.parametrize("test_module", test_data_tensor)
-def test_eq_tensor_tosa_BI(test_module):
+def test_eq_scalar_tosa_BI_tensor(test_module):
     pipeline = TosaPipelineBI[input_t](
-        test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op
+        test_module(),
+        test_module().get_inputs(),
+        Equal.aten_op_Tensor,
+        Equal.exir_op,
     )
     pipeline.run()
 
@@ -107,20 +112,24 @@ def test_eq_tensor_tosa_BI(test_module):
 @common.parametrize("test_module", test_data_scalar)
 def test_eq_scalar_tosa_BI(test_module):
     pipeline = TosaPipelineBI[input_t](
-        test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op
+        test_module(),
+        test_module().get_inputs(),
+        Equal.aten_op_Tensor,
+        Equal.exir_op,
     )
     pipeline.run()
 
 
 @common.parametrize("test_module", test_data_tensor)
 @common.XfailIfNoCorstone300
-def test_eq_tensor_u55_BI(test_module):
+def test_eq_scalar_u55_BI_tensor(test_module):
     # EQUAL is not supported on U55.
     pipeline = OpNotSupportedPipeline[input_t](
-        test_module,
-        test_module.get_inputs(),
-        "TOSA-0.80+BI+u55",
+        test_module(),
+        test_module().get_inputs(),
         {Equal.exir_op: 1},
+        quantize=True,
+        u55_subset=True,
     )
     pipeline.run()
 
@@ -130,11 +139,12 @@ def test_eq_tensor_u55_BI(test_module):
 def test_eq_scalar_u55_BI(test_module):
     # EQUAL is not supported on U55.
     pipeline = OpNotSupportedPipeline[input_t](
-        test_module,
-        test_module.get_inputs(),
-        "TOSA-0.80+BI+u55",
+        test_module(),
+        test_module().get_inputs(),
         {Equal.exir_op: 1},
         n_expected_delegates=1,
+        quantize=True,
+        u55_subset=True,
     )
     pipeline.run()
 
@@ -148,10 +158,10 @@ def test_eq_scalar_u55_BI(test_module):
     strict=False,
 )
 @common.XfailIfNoCorstone320
-def test_eq_tensor_u85_BI(test_module):
+def test_eq_scalar_u85_BI_tensor(test_module):
     pipeline = EthosU85PipelineBI[input_t](
-        test_module,
-        test_module.get_inputs(),
+        test_module(),
+        test_module().get_inputs(),
         Equal.aten_op_Tensor,
         Equal.exir_op,
         run_on_fvp=True,
@@ -170,8 +180,8 @@ def test_eq_tensor_u85_BI(test_module):
 @common.XfailIfNoCorstone320
 def test_eq_scalar_u85_BI(test_module):
     pipeline = EthosU85PipelineBI[input_t](
-        test_module,
-        test_module.get_inputs(),
+        test_module(),
+        test_module().get_inputs(),
         Equal.aten_op_Tensor,
         Equal.exir_op,
         run_on_fvp=True,
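The test_eq.py hunks above also change how not-supported-on-U55 cases are declared: the TOSA profile string "TOSA-0.80+BI+u55" is dropped in favor of quantize=True and u55_subset=True keywords. A hedged mock of that call shape follows; op_not_supported is an invented stand-in, and the real OpNotSupportedPipeline signature lives in test_pipeline.py:

    # Hypothetical stand-in illustrating the keyword-driven call shape.
    def op_not_supported(module, inputs, non_delegated_ops,
                         n_expected_delegates=0, quantize=False, u55_subset=False):
        # quantize=True selects the quantized (BI) flow that the old
        # "TOSA-0.80+BI+u55" string encoded; u55_subset=True applies the
        # U55 operator-support restrictions on top of it.
        profile = ("BI" if quantize else "MI") + ("+u55" if u55_subset else "")
        return profile, non_delegated_ops, n_expected_delegates

    profile, ops, n = op_not_supported(
        None,
        (),
        {"aten_eq_Tensor_exir_op": 1},  # symbolic stand-in for Equal.exir_op
        quantize=True,
        u55_subset=True,
    )
    assert profile == "BI+u55"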
diff --git a/backends/arm/test/ops/test_erf.py b/backends/arm/test/ops/test_erf.py
index d452be7cae1..e7136036c65 100644
--- a/backends/arm/test/ops/test_erf.py
+++ b/backends/arm/test/ops/test_erf.py
@@ -24,24 +24,24 @@ def forward(self, x: torch.Tensor):
         return torch.erf(x)
 
     test_data: dict[str, input_t1] = {
-        "zeros": (torch.zeros(1, 10, 10, 10),),
-        "ones": (torch.ones(10, 10, 10),),
-        "rand": ((torch.rand(10, 10) - 0.5),),
-        "randn_pos": ((torch.randn(1, 4, 4, 4) + 10),),
-        "randn_neg": ((torch.randn(1, 4, 4, 4) - 10),),
-        "ramp": (torch.arange(-16, 16, 0.2),),
+        "zeros": lambda: (torch.zeros(1, 10, 10, 10),),
+        "ones": lambda: (torch.ones(10, 10, 10),),
+        "rand": lambda: ((torch.rand(10, 10) - 0.5),),
+        "randn_pos": lambda: ((torch.randn(1, 4, 4, 4) + 10),),
+        "randn_neg": lambda: ((torch.randn(1, 4, 4, 4) - 10),),
+        "ramp": lambda: (torch.arange(-16, 16, 0.2),),
     }
 
 
 @common.parametrize("test_data", Erf.test_data)
 def test_erf_tosa_MI(test_data: input_t1):
-    pipeline = TosaPipelineMI[input_t1](Erf(), test_data, aten_op, exir_op)
+    pipeline = TosaPipelineMI[input_t1](Erf(), test_data(), aten_op, exir_op)
     pipeline.run()
 
 
 @common.parametrize("test_data", Erf.test_data)
 def test_erf_tosa_BI(test_data: input_t1):
-    pipeline = TosaPipelineBI[input_t1](Erf(), test_data, aten_op, exir_op)
+    pipeline = TosaPipelineBI[input_t1](Erf(), test_data(), aten_op, exir_op)
     pipeline.run()
 
 
@@ -49,7 +49,7 @@ def test_erf_tosa_BI(test_data: input_t1):
 @common.XfailIfNoCorstone300
 def test_erf_u55_BI(test_data: input_t1):
     pipeline = EthosU55PipelineBI[input_t1](
-        Erf(), test_data, aten_op, exir_op, run_on_fvp=True
+        Erf(), test_data(), aten_op, exir_op, run_on_fvp=True
     )
     pipeline.run()
 
@@ -58,6 +58,6 @@ def test_erf_u55_BI(test_data: input_t1):
 @common.XfailIfNoCorstone320
 def test_erf_u85_BI(test_data: input_t1):
     pipeline = EthosU85PipelineBI[input_t1](
-        Erf(), test_data, aten_op, exir_op, run_on_fvp=True
+        Erf(), test_data(), aten_op, exir_op, run_on_fvp=True
     )
     pipeline.run()
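Across these files, the @pytest.mark.corstone_fvp marker plus a conftest.is_option_enabled("corstone_fvp") check gives way to @common.XfailIfNoCorstone300 / @common.XfailIfNoCorstone320 decorators, gating FVP execution on the environment rather than a CLI option. Below is a sketch of how such a decorator could be built; the FVP binary name and the xfail mechanics are assumptions for illustration, not the actual definitions in common.py:

    import shutil

    import pytest

    def xfail_if_no_corstone300(test_func):
        # Assumed mechanics: mark the test xfail when no Corstone-300 FVP
        # binary (name assumed here) is on PATH, so the suite still passes
        # on machines without the simulator installed.
        missing = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") is None
        return pytest.mark.xfail(
            missing, reason="Corstone-300 FVP not installed"
        )(test_func)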
diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py
index 3fa9f8c99fa..9218455916a 100644
--- a/backends/arm/test/ops/test_exp.py
+++ b/backends/arm/test/ops/test_exp.py
@@ -1,127 +1,85 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import unittest
 
 from typing import Tuple
 
-import pytest
-
 import torch
-from executorch.backends.arm.test import common, conftest
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.backend_details import CompileSpec
-from parameterized import parameterized
 
-test_data_suite = [
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+test_data_suite = {
     # (test_name, test_data)
-    ("zeros", torch.zeros(1, 10, 10, 10)),
-    ("ones", torch.ones(10, 10, 10)),
-    ("rand", torch.rand(10, 10) - 0.5),
-    ("randn_pos", torch.randn(1, 4, 4, 4) + 10),
-    ("randn_neg", torch.randn(10) - 10),
-    ("ramp", torch.arange(-16, 16, 0.2)),
-]
-
-
-class TestExp(unittest.TestCase):
-    """Tests lowering of aten.exp"""
-
-    class Exp(torch.nn.Module):
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            return torch.exp(x)
-
-    def _test_exp_tosa_MI_pipeline(
-        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
-            )
-            .export()
-            .check(["torch.ops.aten.exp.default"])
-            .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
-            .quantize()
-            .export()
-            .check(["torch.ops.aten.exp.default"])
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_exp_ethosu_BI_pipeline(
-        self,
-        compile_spec: CompileSpec,
-        module: torch.nn.Module,
-        test_data: Tuple[torch.tensor],
-    ):
-        tester = (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_u55_compile_spec(),
-            )
-            .quantize()
-            .export()
-            .check_count({"torch.ops.aten.exp.default": 1})
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .serialize()
-        )
-        if conftest.is_option_enabled("corstone_fvp"):
-            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
-
-    @parameterized.expand(test_data_suite)
-    def test_exp_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-    ):
-        self._test_exp_tosa_MI_pipeline(self.Exp(), (test_data,))
-
-    @parameterized.expand(test_data_suite)
-    def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,))
-
-    @parameterized.expand(test_data_suite)
-    @pytest.mark.corstone_fvp
-    def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_exp_ethosu_BI_pipeline(
-            common.get_u55_compile_spec(), self.Exp(), (test_data,)
-        )
-
-    @parameterized.expand(test_data_suite)
-    @pytest.mark.corstone_fvp
-    def test_exp_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_exp_ethosu_BI_pipeline(
-            common.get_u85_compile_spec(), self.Exp(), (test_data,)
-        )
+    "zeros": lambda: torch.zeros(1, 10, 10, 10),
+    "ones": lambda: torch.ones(10, 10, 10),
+    "rand": lambda: torch.rand(10, 10) - 0.5,
+    "randn_pos": lambda: torch.randn(1, 4, 4, 4) + 10,
+    "randn_neg": lambda: torch.randn(10) - 10,
+    "ramp": lambda: torch.arange(-16, 16, 0.2),
+}
+
+aten_op = "torch.ops.aten.exp.default"
+input_t1 = Tuple[torch.Tensor]  # Input x
+
+
+class Exp(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.exp(x)
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_exp_tosa_MI(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](
+        Exp(),
+        (test_data(),),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_exp_tosa_BI(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](
+        Exp(),
+        (test_data(),),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone300
+def test_exp_u55_BI(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        Exp(),
+        (test_data(),),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone320
+def test_exp_u85_BI(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        Exp(),
+        (test_data(),),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py
index cd073bddcc8..8f84c39dd27 100644
--- a/backends/arm/test/ops/test_expand.py
+++ b/backends/arm/test/ops/test_expand.py
@@ -7,7 +7,6 @@
 # Tests the expand op which copies the data of the input tensor (possibly with new data format)
 #
 
-import unittest
 
 from typing import Sequence, Tuple
 
@@ -15,153 +14,121 @@
 import torch
 
-from executorch.backends.arm.quantizer import (
-    EthosUQuantizer,
-    get_symmetric_quantization_config,
-    TOSAQuantizer,
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+ TosaPipelineMI, ) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification - -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - - -class TestSimpleExpand(unittest.TestCase): - """Tests the Tensor.expand which should be converted to a repeat op by a pass.""" - - class Expand(torch.nn.Module): - # (input tensor, multiples) - test_parameters = [ - (torch.rand(1), (2,)), - (torch.randn(1), (2, 2, 4)), - (torch.randn(1, 1, 1, 5), (1, 4, -1, -1)), - (torch.randn(1, 1), (1, 2, 2, 4)), - (torch.randn(1, 1), (2, 2, 2, 4)), - (torch.randn(10, 1, 1, 97), (-1, 4, -1, -1)), - (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)), - (torch.randn(1, 4), (1, -1)), - (torch.randn(1, 1, 192), (1, -1, -1)), - ] - - def forward(self, x: torch.Tensor, m: Sequence): - return x.expand(m) - - def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.expand.default": 1}) - .to_edge() - .partition() - .check_not(["torch.ops.aten.expand.default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.expand.default": 1}) - .to_edge() - .partition() - .check_not(["torch.ops.aten.expand.default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_expand_ethosu_BI_pipeline( - self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.expand.default": 1}) - .to_edge() - .partition() - .check_not(["torch.ops.aten.expand.default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(Expand.test_parameters) - def test_expand_tosa_MI(self, test_input, multiples): - self._test_expand_tosa_MI_pipeline(self.Expand(), (test_input, multiples)) - - @parameterized.expand(Expand.test_parameters) - def test_expand_tosa_BI(self, test_input, multiples): - self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) - - @parameterized.expand(Expand.test_parameters[:-5]) - @pytest.mark.corstone_fvp - def test_expand_u55_BI(self, test_input, multiples): - 
self._test_expand_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) - ) - - # MLETORCH-629: Expand does not work on FVP with batch>1 - @parameterized.expand(Expand.test_parameters[-5:-2]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_expand_u55_BI_xfails_on_fvp(self, test_input, multiples): - self._test_expand_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) - ) - - @parameterized.expand(Expand.test_parameters[-2:]) - @pytest.mark.xfail( - reason="MLETORCH-716: Node will be optimized away and Vela can't handle empty graphs" + +aten_op = "torch.ops.aten.expand.default" +input_t1 = Tuple[torch.Tensor, torch.Tensor] # Input x, Input y + + +class Expand(torch.nn.Module): + # (input tensor, multiples) + test_parameters = { + "rand_1d_both": lambda: (torch.rand(1), (2,)), + "rand_1d": lambda: (torch.randn(1), (2, 2, 4)), + "rand_4d": lambda: (torch.randn(1, 1, 1, 5), (1, 4, -1, -1)), + "rand_batch_1": lambda: (torch.randn(1, 1), (1, 2, 2, 4)), + "rand_batch_2": lambda: (torch.randn(1, 1), (2, 2, 2, 4)), + "rand_mix_neg": lambda: (torch.randn(10, 1, 1, 97), (-1, 4, -1, -1)), + "rand_small_neg": lambda: (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)), + } + + test_reject_set = { + "rand_2d": lambda: (torch.randn(1, 4), (1, -1)), + "rand_neg_mul": lambda: (torch.randn(1, 1, 192), (1, -1, -1)), + } + + def forward(self, x: torch.Tensor, m: Sequence): + return x.expand(m) + + +@common.parametrize("test_data", Expand.test_parameters | Expand.test_reject_set) +def test_expand_tosa_MI(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Expand.test_parameters | Expand.test_reject_set) +def test_expand_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +x_fails = { + "rand_batch_2": "AssertionError: Output 0 does not match reference output.", + "rand_mix_neg": "AssertionError: Output 0 does not match reference output.", + "rand_small_neg": "AssertionError: Output 0 does not match reference output.", +} + + +@common.parametrize("test_data", Expand.test_parameters, x_fails) +@common.XfailIfNoCorstone300 +def test_expand_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, ) - def test_expand_u55_BI_xfails(self, test_input, multiples): - self._test_expand_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) - ) - - @parameterized.expand(Expand.test_parameters[:-5]) - @pytest.mark.corstone_fvp - def test_expand_u85_BI(self, test_input, multiples): - self._test_expand_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) - ) - - # MLETORCH-629: Expand does not work on FVP with batch>1 - @parameterized.expand(Expand.test_parameters[-5:-2]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_expand_u85_BI_xfails_on_fvp(self, test_input, multiples): - self._test_expand_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) - ) - - @parameterized.expand(Expand.test_parameters[-2:]) - @pytest.mark.xfail( - reason="MLETORCH-716: Node will be optimized away and Vela can't handle empty graphs" + pipeline.run() + + +@common.parametrize("test_data", Expand.test_parameters, x_fails) 
+@common.XfailIfNoCorstone320 +def test_expand_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Expand.test_reject_set) +@common.XfailIfNoCorstone300 +@pytest.mark.xfail( + reason="MLETORCH-716: Node will be optimized away and Vela can't handle empty graphs" +) +def test_expand_u55_BI_failure_set(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Expand.test_reject_set) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail( + reason="MLETORCH-716: Node will be optimized away and Vela can't handle empty graphs" +) +def test_expand_u85_BI_failure_set(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Expand(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, ) - def test_expand_u85_xfails(self, test_input, multiples): - self._test_expand_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) - ) + pipeline.run() diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 193ed632ed0..13a3146f2fe 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -8,186 +8,199 @@ # The shape and value are set at compile time, i.e. can't be set by a tensor input. # -import unittest - from typing import Tuple import pytest import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestFull(unittest.TestCase): - """Tests the full op which creates a tensor of a given shape filled with a given value.""" - - class Full(torch.nn.Module): - # A single full op - def forward(self): - return torch.full((3, 3), 4.5) - - class AddConstFull(torch.nn.Module): - # Input + a full with constant value. - def forward(self, x: torch.Tensor): - return torch.full((2, 2, 3, 3), 4.5, dtype=torch.float32) + x - - class AddVariableFull(torch.nn.Module): - sizes: list[tuple[int, ...]] = [ - (5,), - (5, 5), - (5, 5, 5), - (1, 5, 5, 5), - ] - test_parameters = [((torch.randn(n) * 10 - 5, 3.2),) for n in sizes] - - def forward(self, x: torch.Tensor, y): - # Input + a full with the shape from the input and a given value 'y'. - return x + torch.full(x.shape, y) - - class FullLike(torch.nn.Module): - """Since full_like is replaced with full, we only need to test on reference model, not FVP.""" - - test_parameters = [ - ((torch.randn(2, 2, 2, 2) * 50, 3.2),), - ((torch.randn(2, 2, 2, 2) * 50, 3),), - (((torch.randn(2, 2, 2, 2) * 50).to(torch.int32), 3.2),), - (((torch.randn(2, 2, 2, 2) * 50).to(torch.int32), 3),), - ] - - def forward(self, input_tensor: torch.Tensor, value): - # Our backend can't handle tensors without users, which input_tensor doesn't have - # when the full_like is converted to a full. Therefore involve it in the output. 
- return input_tensor + torch.full_like(input_tensor, value) - - def _test_full_tosa_MI_pipeline( - self, - module: torch.nn.Module, - example_data: Tuple, - test_data: Tuple | None = None, - ): - if test_data is None: - test_data = example_data - ( - ArmTester( - module, - example_inputs=example_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_full_tosa_BI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple, - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_full_tosa_ethos_pipeline( - self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple - ): - tester = ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): - self._test_full_tosa_ethos_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_full_tosa_u85_pipeline(self, module: torch.nn.Module, test_data: Tuple): - self._test_full_tosa_ethos_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - def test_only_full_tosa_MI(self): - self._test_full_tosa_MI_pipeline(self.Full(), ()) - - def test_const_full_tosa_MI(self): - _input = torch.rand((2, 2, 3, 3)) * 10 - self._test_full_tosa_MI_pipeline(self.AddConstFull(), (_input,)) - - @parameterized.expand(FullLike.test_parameters) - def test_full_like_tosa_MI(self, test_tensor: Tuple): - self._test_full_tosa_MI_pipeline(self.FullLike(), test_tensor) - - @parameterized.expand(AddVariableFull.test_parameters) - def test_full_tosa_MI(self, test_tensor: Tuple): - self._test_full_tosa_MI_pipeline( - self.AddVariableFull(), example_data=test_tensor - ) - - @parameterized.expand(AddVariableFull.test_parameters) - def test_full_tosa_BI(self, test_tensor: Tuple): - self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor) - - @parameterized.expand(FullLike.test_parameters) - def test_full_like_tosa_BI(self, test_tensor: Tuple): - self._test_full_tosa_BI_pipeline(self.FullLike(), test_tensor) - - @parameterized.expand(AddVariableFull.test_parameters) - @pytest.mark.corstone_fvp - def test_full_u55_BI(self, test_tensor: Tuple): - self._test_full_tosa_u55_pipeline( - self.AddVariableFull(), - test_tensor, - ) - - @parameterized.expand(AddVariableFull.test_parameters) - @pytest.mark.corstone_fvp - def test_full_u85_BI(self, test_tensor: Tuple): - self._test_full_tosa_u85_pipeline( - self.AddVariableFull(), - test_tensor, - ) - - def test_integer_value(self): - _input = torch.ones((2, 2)) - integer_fill_value = 1 - 
self._test_full_tosa_MI_pipeline( - self.AddVariableFull(), example_data=(_input, integer_fill_value) - ) - - # This fails since the fill value in the full tensor is set at compile time by the example data (1.). - # Test data tries to set it again at runtime (to 2.) but it doesn't do anything. - # In eager mode, the fill value can be set at runtime, causing the outputs to not match. - @unittest.expectedFailure - def test_set_value_at_runtime(self): - _input = torch.ones((2, 2)) - example_fill_value = 1.0 - test_fill_value = 2.0 - self._test_full_tosa_MI_pipeline( - self.AddVariableFull(), - example_data=(_input, example_fill_value), - test_data=(_input, test_fill_value), - ) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor, int] + +exir_op = "executorch_exir_dialects_edge__ops_aten_full_default" + + +class Full(torch.nn.Module): + # A single full op + def forward(self): + return torch.full((3, 3), 4.5) + + +class AddConstFull(torch.nn.Module): + # Input + a full with constant value. + def forward(self, x: torch.Tensor): + return torch.full((2, 2, 3, 3), 4.5, dtype=torch.float32) + x + + +class AddVariableFull(torch.nn.Module): + sizes: list[tuple[int, ...]] = [ + (5,), + (5, 5), + (5, 5, 5), + (1, 5, 5, 5), + ] + test_parameters = {} + for i, n in enumerate(sizes): + test_parameters[f"slice_randn_{i}"] = (torch.randn(n) * 10 - 5, 3.2) + + def forward(self, x: torch.Tensor, y): + # Input + a full with the shape from the input and a given value 'y'. + return x + torch.full(x.shape, y) + + +class FullLike(torch.nn.Module): + """Since full_like is replaced with full, we only need to test on reference model, not FVP.""" + + test_parameters = { + "full_like_value_3_2": lambda: (torch.randn(2, 2, 2, 2) * 50, 3.2), + "full_like_value_3": lambda: (torch.randn(2, 2, 2, 2) * 50, 3), + "full_like_value_3_2_int32": lambda: ( + (torch.randn(2, 2, 2, 2) * 50).to(torch.int32), + 3.2, + ), + "full_like_value_3_int32": lambda: ( + (torch.randn(2, 2, 2, 2) * 50).to(torch.int32), + 3, + ), + } + + def forward(self, input_tensor: torch.Tensor, value): + # Our backend can't handle tensors without users, which input_tensor doesn't have + # when the full_like is converted to a full. Therefore involve it in the output. 
+ return input_tensor + torch.full_like(input_tensor, value) + + +def test_full_tosa_MI_only(): + pipeline = TosaPipelineMI[input_t1]( + Full(), + (), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +def test_full_tosa_MI_const(): + test_data = (torch.rand((2, 2, 3, 3)) * 10,) + pipeline = TosaPipelineMI[input_t1]( + AddConstFull(), + test_data, + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", FullLike.test_parameters) +def test_full_like_tosa_MI(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + FullLike(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", AddVariableFull.test_parameters) +def test_full_tosa_MI(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + AddVariableFull(), + test_data, + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", AddVariableFull.test_parameters) +def test_full_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + AddVariableFull(), + test_data, + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", FullLike.test_parameters) +def test_full_like_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + FullLike(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +@common.parametrize("test_data", AddVariableFull.test_parameters) +@common.XfailIfNoCorstone320 +def test_full_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + AddVariableFull(), + test_data, + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_data", AddVariableFull.test_parameters) +@common.XfailIfNoCorstone300 +def test_full_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + AddVariableFull(), + test_data, + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +# This fails since full outputs int64 by default if 'fill_value' is integer, which our backend doesn't support. +@pytest.mark.skip( + "This fails since full outputs int64 by default if 'fill_value' is integer, which our backend doesn't support." +) +def test_full_tosa_MI_integer_value(): + test_data = (torch.ones((2, 2)), 1.0) + pipeline = TosaPipelineMI[input_t1]( + AddVariableFull(), + test_data, + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +# This fails since the fill value in the full tensor is set at compile time by the example data (1.). +# Test data tries to set it again at runtime (to 2.) but it doesn't do anything. +# In eager mode, the fill value can be set at runtime, causing the outputs to not match. +@pytest.mark.skip( + "This fails since the fill value in the full tensor is set at compile time by the example data (1.)." 
+) +def test_full_tosa_MI_set_value_at_runtime(tosa_version: str): + test_data = (torch.ones((2, 2)), 1.0) + pipeline = TosaPipelineMI[input_t1]( + AddVariableFull(), + test_data, + aten_op=[], + exir_op=exir_op, + ) + pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.add_stage( + pipeline.tester.run_method_and_compare_outputs, inputs=(torch.ones((2, 2)), 2.0) + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index 7bcd2c923a4..19c036be526 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -62,25 +62,25 @@ def get_inputs(self): op_ge_scalar_rank4_randn = GreaterEqual(torch.randn(3, 2, 2, 2), 0.3) test_data_tensor = { - "ge_tensor_rank1_ones": op_ge_tensor_rank1_ones, - "ge_tensor_rank2_rand": op_ge_tensor_rank2_rand, - "ge_tensor_rank3_randn": op_ge_tensor_rank3_randn, - "ge_tensor_rank4_randn": op_ge_tensor_rank4_randn, + "ge_tensor_rank1_ones": lambda: op_ge_tensor_rank1_ones, + "ge_tensor_rank2_rand": lambda: op_ge_tensor_rank2_rand, + "ge_tensor_rank3_randn": lambda: op_ge_tensor_rank3_randn, + "ge_tensor_rank4_randn": lambda: op_ge_tensor_rank4_randn, } test_data_scalar = { - "ge_scalar_rank1_ones": op_ge_scalar_rank1_ones, - "ge_scalar_rank2_rand": op_ge_scalar_rank2_rand, - "ge_scalar_rank3_randn": op_ge_scalar_rank3_randn, - "ge_scalar_rank4_randn": op_ge_scalar_rank4_randn, + "ge_scalar_rank1_ones": lambda: op_ge_scalar_rank1_ones, + "ge_scalar_rank2_rand": lambda: op_ge_scalar_rank2_rand, + "ge_scalar_rank3_randn": lambda: op_ge_scalar_rank3_randn, + "ge_scalar_rank4_randn": lambda: op_ge_scalar_rank4_randn, } @common.parametrize("test_module", test_data_tensor) def test_ge_tensor_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, ) @@ -90,8 +90,8 @@ def test_ge_tensor_tosa_MI(test_module): @common.parametrize("test_module", test_data_scalar) def test_ge_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_scalar, GreaterEqual.exir_op, ) @@ -101,8 +101,8 @@ def test_ge_scalar_tosa_MI(test_module): @common.parametrize("test_module", test_data_tensor) def test_ge_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, ) @@ -112,8 +112,8 @@ def test_ge_tensor_tosa_BI(test_module): @common.parametrize("test_module", test_data_scalar) def test_ge_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, ) @@ -125,10 +125,11 @@ def test_ge_scalar_tosa_BI(test_module): def test_ge_tensor_u55_BI(test_module): # GREATER_EQUAL is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {GreaterEqual.exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -138,11 +139,12 @@ def test_ge_tensor_u55_BI(test_module): def test_ge_scalar_u55_BI(test_module): # GREATER_EQUAL is not supported on U55. 
pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {GreaterEqual.exir_op: 1}, n_expected_delegates=1, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -155,8 +157,8 @@ def test_ge_scalar_u55_BI(test_module): @common.XfailIfNoCorstone320 def test_ge_tensor_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, run_on_fvp=True, @@ -172,8 +174,8 @@ def test_ge_tensor_u85_BI(test_module): @common.XfailIfNoCorstone320 def test_ge_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_gelu.py b/backends/arm/test/ops/test_gelu.py index fb1253fdb0c..6ac9b5dabf5 100644 --- a/backends/arm/test/ops/test_gelu.py +++ b/backends/arm/test/ops/test_gelu.py @@ -22,51 +22,51 @@ class Gelu(torch.nn.Module): exir_op = "executorch_exir_dialects_edge__ops_aten_gelu_default" test_data: dict[str, Tuple[str, input_t1]] = { - "zeros_none": ( + "zeros_none": lambda: ( "none", torch.zeros(1, 10, 10, 10), ), - "ones_none": ( + "ones_none": lambda: ( "none", torch.ones(10, 10, 10), ), - "rand_none": ( + "rand_none": lambda: ( "none", (torch.rand(10, 10) - 0.5), ), - "randn_pos_none": ( + "randn_pos_none": lambda: ( "none", (torch.randn(1, 4, 4, 4) + 10), ), - "randn_neg_none": ( + "randn_neg_none": lambda: ( "none", (torch.randn(1, 4, 4, 4) - 10), ), - "ramp_none": ( + "ramp_none": lambda: ( "none", torch.arange(-16, 16, 0.2), ), - "zeros_tanh": ( + "zeros_tanh": lambda: ( "tanh", torch.zeros(1, 10, 10, 10), ), - "ones_tanh": ( + "ones_tanh": lambda: ( "tanh", torch.ones(10, 10, 10), ), - "rand_tanh": ( + "rand_tanh": lambda: ( "tanh", (torch.rand(10, 10) - 0.5), ), - "randn_pos_tanh": ( + "randn_pos_tanh": lambda: ( "tanh", (torch.randn(1, 4, 4, 4) + 10), ), - "randn_neg_tanh": ( + "randn_neg_tanh": lambda: ( "tanh", (torch.randn(1, 4, 4, 4) - 10), ), - "ramp_tanh": ( + "ramp_tanh": lambda: ( "tanh", torch.arange(-16, 16, 0.2), ), @@ -82,10 +82,10 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", Gelu.test_data) def test_gelu_tosa_MI(test_data: input_t1): - approximate = test_data[0] + approximate, test_data = test_data() TosaPipelineMI[input_t1]( Gelu(approximate), - (test_data[1],), + (test_data,), Gelu.aten_op, Gelu.exir_op, use_to_edge_transform_and_lower=False, @@ -94,32 +94,34 @@ def test_gelu_tosa_MI(test_data: input_t1): @common.parametrize("test_data", Gelu.test_data) def test_gelu_tosa_BI(test_data: input_t1): - approximate = test_data[0] + approximate, test_data = test_data() TosaPipelineBI[input_t1]( Gelu(approximate), - (test_data[1],), + (test_data,), Gelu.aten_op, Gelu.exir_op, ).run() @common.parametrize("test_data", Gelu.test_data) +@common.XfailIfNoCorstone300 def test_gelu_u55_BI(test_data: input_t1): - approximate = test_data[0] + approximate, test_data = test_data() EthosU55PipelineBI[input_t1]( Gelu(approximate), - (test_data[1],), + (test_data,), Gelu.aten_op, Gelu.exir_op, ).run() @common.parametrize("test_data", Gelu.test_data) +@common.XfailIfNoCorstone320 def test_gelu_u85_BI(test_data: input_t1): - approximate = test_data[0] + approximate, test_data = test_data() EthosU85PipelineBI[input_t1]( 
Gelu(approximate), - (test_data[1],), + (test_data,), Gelu.aten_op, Gelu.exir_op, ).run() diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 15515958c85..0a1b97928fd 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -63,24 +63,27 @@ def get_inputs(self): op_gt_scalar_rank4_randn = Greater(torch.randn(3, 2, 2, 2), 0.3) test_data_tensor = { - "gt_tensor_rank1_ones": op_gt_tensor_rank1_ones, - "gt_tensor_rank2_rand": op_gt_tensor_rank2_rand, - "gt_tensor_rank3_randn": op_gt_tensor_rank3_randn, - "gt_tensor_rank4_randn": op_gt_tensor_rank4_randn, + "gt_tensor_rank1_ones": lambda: op_gt_tensor_rank1_ones, + "gt_tensor_rank2_rand": lambda: op_gt_tensor_rank2_rand, + "gt_tensor_rank3_randn": lambda: op_gt_tensor_rank3_randn, + "gt_tensor_rank4_randn": lambda: op_gt_tensor_rank4_randn, } test_data_scalar = { - "gt_scalar_rank1_ones": op_gt_scalar_rank1_ones, - "gt_scalar_rank2_rand": op_gt_scalar_rank2_rand, - "gt_scalar_rank3_randn": op_gt_scalar_rank3_randn, - "gt_scalar_rank4_randn": op_gt_scalar_rank4_randn, + "gt_scalar_rank1_ones": lambda: op_gt_scalar_rank1_ones, + "gt_scalar_rank2_rand": lambda: op_gt_scalar_rank2_rand, + "gt_scalar_rank3_randn": lambda: op_gt_scalar_rank3_randn, + "gt_scalar_rank4_randn": lambda: op_gt_scalar_rank4_randn, } @common.parametrize("test_module", test_data_tensor) def test_gt_tensor_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op + test_module(), + test_module().get_inputs(), + Greater.aten_op_tensor, + Greater.exir_op, ) pipeline.run() @@ -88,7 +91,10 @@ def test_gt_tensor_tosa_MI(test_module): @common.parametrize("test_module", test_data_scalar) def test_gt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), Greater.aten_op_scalar, Greater.exir_op + test_module(), + test_module().get_inputs(), + Greater.aten_op_scalar, + Greater.exir_op, ) pipeline.run() @@ -96,7 +102,10 @@ def test_gt_scalar_tosa_MI(test_module): @common.parametrize("test_module", test_data_tensor) def test_gt_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op + test_module(), + test_module().get_inputs(), + Greater.aten_op_tensor, + Greater.exir_op, ) pipeline.run() @@ -104,7 +113,10 @@ def test_gt_tensor_tosa_BI(test_module): @common.parametrize("test_module", test_data_scalar) def test_gt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op + test_module(), + test_module().get_inputs(), + Greater.aten_op_tensor, + Greater.exir_op, ) pipeline.run() @@ -114,10 +126,11 @@ def test_gt_scalar_tosa_BI(test_module): def test_gt_tensor_u55_BI(test_module): # Greater is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {Greater.exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -127,11 +140,12 @@ def test_gt_tensor_u55_BI(test_module): def test_gt_scalar_u55_BI(test_module): # Greater is not supported on U55. 
pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {Greater.exir_op: 1}, n_expected_delegates=1, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -146,8 +160,8 @@ def test_gt_scalar_u55_BI(test_module): @common.XfailIfNoCorstone320 def test_gt_tensor_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, run_on_fvp=True, @@ -165,8 +179,8 @@ def test_gt_tensor_u85_BI(test_module): @common.XfailIfNoCorstone320 def test_gt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_hardsigmoid.py b/backends/arm/test/ops/test_hardsigmoid.py index f73a995b120..399c6088e89 100644 --- a/backends/arm/test/ops/test_hardsigmoid.py +++ b/backends/arm/test/ops/test_hardsigmoid.py @@ -1,128 +1,89 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +aten_op = "torch.ops.aten.hardsigmoid.default" +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +test_data_suite = { # (test_name, test_data) - ("zeros", torch.zeros(1, 10, 10, 10)), - ("ones", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), - ("randn_neg", torch.randn(10) - 10), - ("ramp", torch.arange(-16, 16, 0.2)), -] - - -class TestHardsigmoid(unittest.TestCase): - class Hardsigmoid(torch.nn.Module): - def __init__(self): - super().__init__() - self.hardsigmoid = torch.nn.Hardsigmoid() - - def forward(self, x): - return self.hardsigmoid(x) - - def _test_hardsigmoid_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.hardsigmoid.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardsigmoid_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.hardsigmoid.default"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardsigmoid_tosa_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.hardsigmoid.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_hardsigmoid_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_hardsigmoid_tosa_MI_pipeline(self.Hardsigmoid(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_hardsigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardsigmoid_tosa_BI_pipeline(self.Hardsigmoid(), (test_data,)) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardsigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardsigmoid_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), self.Hardsigmoid(), (test_data,) - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardsigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardsigmoid_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), self.Hardsigmoid(), (test_data,) - ) + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), +} + + +class Hardsigmoid(torch.nn.Module): + def __init__(self): + super().__init__() + self.hardsigmoid = torch.nn.Hardsigmoid() + + def forward(self, x): + return self.hardsigmoid(x) + + +@common.parametrize("test_data", test_data_suite) +def test_hardsigmoid_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1]( + Hardsigmoid(), + (test_data(),), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_hardsigmoid_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Hardsigmoid(), + (test_data(),), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_hardsigmoid_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Hardsigmoid(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_hardsigmoid_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Hardsigmoid(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_hardswish.py b/backends/arm/test/ops/test_hardswish.py index 81aba540e3f..bd61346e3db 100644 --- a/backends/arm/test/ops/test_hardswish.py 
+++ b/backends/arm/test/ops/test_hardswish.py @@ -1,128 +1,79 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +aten_op = "torch.ops.aten.hardswish.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_clamp_default" -test_data_suite = [ +input_t1 = Tuple[torch.Tensor] + +test_data_suite = { # (test_name, test_data) - ("zeros", torch.zeros(1, 10, 10, 10)), - ("ones", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), - ("randn_neg", torch.randn(10) - 10), - ("ramp", torch.arange(-16, 16, 0.2)), -] - - -class TestHardswish(unittest.TestCase): - class Hardswish(torch.nn.Module): - def __init__(self): - super().__init__() - self.hardswish = torch.nn.Hardswish() - - def forward(self, x): - return self.hardswish(x) - - def _test_hardswish_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.hardswish.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardswish_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.hardswish.default"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardswish_tosa_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.hardswish.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_hardswish_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_hardswish_tosa_MI_pipeline(self.Hardswish(), (test_data,)) - - 
@parameterized.expand(test_data_suite) - def test_hardswish_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardswish_tosa_BI_pipeline(self.Hardswish(), (test_data,)) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardswish_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardswish_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), self.Hardswish(), (test_data,) - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardswish_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardswish_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), self.Hardswish(), (test_data,) - ) + "zeros": lambda: (torch.zeros(1, 10, 10, 10)), + "ones": lambda: (torch.ones(10, 10, 10)), + "rand": lambda: (torch.rand(10, 10) - 0.5), + "randn_pos": lambda: (torch.randn(10) + 10), + "randn_neg": lambda: (torch.randn(10) - 10), + "ramp": lambda: (torch.arange(-16, 16, 0.2)), +} + + +class Hardswish(torch.nn.Module): + def __init__(self): + super().__init__() + self.hardswish = torch.nn.Hardswish() + + def forward(self, x): + return self.hardswish(x) + + +@common.parametrize("test_data", test_data_suite) +def test_hardswish_tosa_MI(test_data): + pipeline = TosaPipelineMI[input_t1](Hardswish(), (test_data(),), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_hardswish_tosa_BI(test_data): + pipeline = TosaPipelineBI[input_t1](Hardswish(), (test_data(),), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_hardswish_u55_BI(test_data): + EthosU55PipelineBI[input_t1]( + Hardswish(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ).run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_hardswish_u85_BI(test_data): + EthosU85PipelineBI[input_t1]( + Hardswish(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ).run() diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index 46b44078785..f1a50467df7 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -1,143 +1,91 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, ) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester - -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from parameterized import parameterized - -test_data_suite = [ +test_data_suite = { # (test_name, test_data) - ("zeros", torch.zeros(1, 10, 10, 10)), - ("ones", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), - ("randn_neg", torch.randn(10) - 10), - ("ramp", torch.arange(-16, 16, 0.2)), -] - - -class TestHardTanh(unittest.TestCase): - """Tests HardTanh Operator.""" - - class HardTanh(torch.nn.Module): - - def __init__(self): - super().__init__() - - self.hardTanh = torch.nn.Hardtanh() - - def forward(self, x): - return self.hardTanh(x) - - def _test_hardtanh_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.hardtanh.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardtanh_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.hardtanh.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_hardtanh_tosa_ethosu_BI_pipeline( - self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.hardtanh.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def 
test_hardtanh_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_hardtanh_tosa_MI_pipeline(self.HardTanh(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_BI_pipeline(self.HardTanh(), (test_data,)) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.HardTanh(), (test_data,) - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_hardtanh_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.HardTanh(), (test_data,) - ) + "zeros": lambda: (torch.zeros(1, 10, 10, 10)), + "ones": lambda: (torch.ones(10, 10, 10)), + "rand": lambda: (torch.rand(10, 10) - 0.5), + "randn_pos": lambda: (torch.randn(10) + 10), + "randn_neg": lambda: (torch.randn(10) - 10), + "ramp": lambda: (torch.arange(-16, 16, 0.2)), +} + +aten_op = "torch.ops.aten.hardtanh.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_hardtanh_default" + +input_t = Tuple[torch.Tensor] + + +class HardTanh(torch.nn.Module): + + def __init__(self): + super().__init__() + + self.hardTanh = torch.nn.Hardtanh() + + def forward(self, x): + return self.hardTanh(x) + + +@common.parametrize("test_data", test_data_suite) +def test_hardtanh_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t](HardTanh(), (test_data(),), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_hardtanh_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t]( + HardTanh(), + (test_data(),), + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_hardtanh_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t]( + HardTanh(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_hardtanh_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t]( + HardTanh(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 7ed181711a1..d2d9aa0bc14 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -42,18 +42,21 @@ def forward(self, x): input_t = tuple[torch.Tensor] test_data_suite = { - "randn_last_dim": ((torch.randn(1, 5, 5, 5),), LayerNorm([5])), - "rand_last_two_dims": ((torch.rand(1, 5, 5, 5),), LayerNorm([5, 5])), - "rand_last_two_dims_not_elementwise_affine": ( + "randn_last_dim": lambda: ((torch.randn(1, 5, 5, 5),), LayerNorm([5])), + "rand_last_two_dims": lambda: ((torch.rand(1, 5, 5, 5),), LayerNorm([5, 5])), + "rand_last_two_dims_not_elementwise_affine": lambda: ( (torch.rand(1, 5, 5, 5),), LayerNorm([5, 5], 1e-5, False), ), - "rand_last_two_dims_not_elementwise_affine_no_bias": ( + "rand_last_two_dims_not_elementwise_affine_no_bias": lambda: ( (torch.rand(1, 5, 5, 5),), LayerNorm([5, 5], 1e-5, False, False), ), - "randn_last_three_dims": 
((torch.randn(1, 15, 10, 5),), LayerNorm([15, 10, 5])), - "randn_last_three_dims_no_bias": ( + "randn_last_three_dims": lambda: ( + (torch.randn(1, 15, 10, 5),), + LayerNorm([15, 10, 5]), + ), + "randn_last_three_dims_no_bias": lambda: ( (torch.randn(1, 15, 10, 5),), LayerNorm([15, 10, 5], 1e-2, False, False), ), @@ -62,9 +65,10 @@ def forward(self, x): @common.parametrize("test_data", test_data_suite) def test_native_layer_norm_tosa_MI(test_data): + test_data, model = test_data() pipeline = TosaPipelineMI[input_t]( - test_data[1], - test_data[0], + model, + test_data, "torch.ops.aten.layer_norm.default", ) pipeline.run() @@ -72,9 +76,10 @@ def test_native_layer_norm_tosa_MI(test_data): @common.parametrize("test_data", test_data_suite) def test_native_layer_norm_tosa_BI(test_data): + test_data, model = test_data() pipeline = TosaPipelineBI[input_t]( - test_data[1], - test_data[0], + model, + test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -84,9 +89,10 @@ def test_native_layer_norm_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_native_layer_norm_u55_BI(test_data): + test_data, model = test_data() pipeline = EthosU55PipelineBI[input_t]( - test_data[1], - test_data[0], + model, + test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition run_on_fvp=True, ) @@ -97,9 +103,10 @@ def test_native_layer_norm_u55_BI(test_data): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_native_layer_norm_u85_BI(test_data): + test_data, model = test_data() pipeline = EthosU85PipelineBI[input_t]( - test_data[1], - test_data[0], + model, + test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition run_on_fvp=True, ) diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 7e243ead620..217e409c6f5 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -57,63 +56,38 @@ def get_inputs(self): ) test_data_common = { - "le_rank1_ones": op_le_rank1_ones, - "le_rank2_rand": op_le_rank2_rand, - "le_rank3_randn": op_le_rank3_randn, - "le_rank4_randn": op_le_rank4_randn, + "le_rank1_ones": lambda: op_le_rank1_ones, + "le_rank2_rand": lambda: op_le_rank2_rand, + "le_rank3_randn": lambda: op_le_rank3_randn, + "le_rank4_randn": lambda: op_le_rank4_randn, } @common.parametrize("test_module", test_data_common) -def test_le_tosa_MI(test_module): +def test_le_tensor_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), test_module().get_inputs(), aten_op, exir_op ) pipeline.run() @common.parametrize("test_module", test_data_common) -def test_le_tosa_BI(test_module): +def test_le_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), test_module().get_inputs(), aten_op, exir_op ) pipeline.run() @common.parametrize("test_module", test_data_common) -def test_le_u55_BI(test_module): +def test_le_tensor_u55_BI_not_delegated(test_module): # GREATER_EQUAL is not supported on U55. LE uses the GREATER_EQUAL Tosa operator. 
pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, - ) - pipeline.run() - - -@common.parametrize("test_module", test_data_common) -def test_le_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, - ) - pipeline.run() - - -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_le_u55_BI") -def test_le_u55_BI_on_fvp(test_module): - # GREATER_EQUAL is not supported on U55. LE uses the GREATER_EQUAL Tosa operator. - pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -123,11 +97,11 @@ def test_le_u55_BI_on_fvp(test_module): test_data_common, xfails={"le_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, ) -@common.SkipIfNoCorstone320 -def test_le_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_le_tensor_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), aten_op, exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_leaky_relu.py b/backends/arm/test/ops/test_leaky_relu.py index b9f0c3a8d1a..a83c2812bf0 100644 --- a/backends/arm/test/ops/test_leaky_relu.py +++ b/backends/arm/test/ops/test_leaky_relu.py @@ -28,19 +28,22 @@ def forward(self, x: torch.Tensor): return self.activation(x) test_data: dict[str, input_t1] = { - "zeros": ((torch.zeros(1, 1, 5, 5),), 0.01), - "ones": ((torch.ones(1, 32, 112, 112),), 0.01), - "rand": ((torch.rand(1, 96, 56, 56),), 0.2), - "3Dtensor": ((torch.rand(5, 5, 5),), 0.001), - "negative_slope": ((torch.rand(1, 16, 128, 128),), -0.002), + "zeros": lambda: ((torch.zeros(1, 1, 5, 5),), 0.01), + "ones": lambda: ((torch.ones(1, 32, 112, 112),), 0.01), + "rand": lambda: ((torch.rand(1, 96, 56, 56),), 0.2), + "3Dtensor": lambda: ((torch.rand(5, 5, 5),), 0.001), + "negative_slope": lambda: ((torch.rand(1, 16, 128, 128),), -0.002), } @common.parametrize("test_data", LeakyReLU.test_data) def test_leaky_relu_tosa_MI(test_data): - data, slope = test_data + data, slope = test_data() pipeline = TosaPipelineMI[input_t1]( - LeakyReLU(slope), data, [], use_to_edge_transform_and_lower=True + LeakyReLU(slope), + data, + [], + use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after( "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op] @@ -50,9 +53,12 @@ def test_leaky_relu_tosa_MI(test_data): @common.parametrize("test_data", LeakyReLU.test_data) def test_leaky_relu_tosa_BI(test_data): - data, slope = test_data + data, slope = test_data() pipeline = TosaPipelineBI[input_t1]( - LeakyReLU(slope), data, [], use_to_edge_transform_and_lower=True + LeakyReLU(slope), + data, + [], + use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.run() @@ -61,7 +67,7 @@ def test_leaky_relu_tosa_BI(test_data): @common.parametrize("test_data", LeakyReLU.test_data) @common.XfailIfNoCorstone300 def test_leaky_relu_u55_BI(test_data): - data, slope = test_data + data, slope = test_data() pipeline = EthosU55PipelineBI[input_t1]( LeakyReLU(slope), data, @@ -76,7 +82,7 @@ def test_leaky_relu_u55_BI(test_data): @common.parametrize("test_data", 
LeakyReLU.test_data) @common.XfailIfNoCorstone320 def test_leaky_relu_u85_BI(test_data): - data, slope = test_data + data, slope = test_data() pipeline = EthosU85PipelineBI[input_t1]( LeakyReLU(slope), data, diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 9a289909bae..56d33097999 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -1,271 +1,199 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple import pytest import torch -from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +aten_op = "torch.ops.aten.linear.default" -test_data_suite_rank1 = [ +input_t1 = Tuple[torch.Tensor] + +test_data_suite_rank1 = { # (test_name, test_data, out_features, has_bias) - ( - "model_linear_rank1_zeros", + "model_linear_rank1_zeros": lambda: ( torch.zeros(10), 15, True, ), - ( - "model_linear_rank1_ones", + "model_linear_rank1_ones": lambda: ( torch.ones(10), 15, False, ), - ( - "model_linear_rank1_negative_ones", + "model_linear_rank1_negative_ones": lambda: ( torch.ones(10) * (-1), 20, True, ), - ( - "model_linear_rank1_rand", + "model_linear_rank1_rand": lambda: ( torch.rand(10), 10, True, ), - ( - "model_linear_rank1_negative_large_rand", + "model_linear_rank1_negative_large_rand": lambda: ( torch.rand(10) * (-100), 30, False, ), - ( - "model_linear_rank1_large_randn", + "model_linear_rank1_large_randn": lambda: ( torch.randn(15) * 100, 20, True, ), -] +} -test_data_suite_rank4 = [ +test_data_suite_rank4 = { # (test_name, test_data, out_features, has_bias) - ( - "model_linear_rank4_zeros", + "model_linear_rank4_zeros": lambda: ( torch.zeros(5, 10, 25, 20), 30, True, ), - ( - "model_linear_rank4_ones", + "model_linear_rank4_ones": lambda: ( torch.ones(5, 10, 25, 20), 30, False, ), - ( - "model_linear_rank4_negative_ones", + "model_linear_rank4_negative_ones": lambda: ( torch.ones(5, 10, 25, 20) * (-1), 30, True, ), - ( - "model_linear_rank4_rand", + "model_linear_rank4_rand": lambda: ( torch.rand(5, 10, 25, 20), 30, False, ), - ( - "model_linear_rank4_negative_large_rand", + "model_linear_rank4_negative_large_rand": lambda: ( torch.rand(5, 10, 25, 20) * (-100), 30, True, ), - ( - "model_linear_rank4_large_randn", + "model_linear_rank4_large_randn": lambda: ( torch.randn(5, 10, 25, 20) * 100, 30, False, ), -] - - -class TestLinear(unittest.TestCase): - """tests the linear operation y = Ax + b""" - - class Linear(torch.nn.Module): - def __init__( - self, - in_features: int, - out_features: int = 3, - bias: bool = True, - ): - super().__init__() - self.fc = torch.nn.Linear( - in_features=in_features, - out_features=out_features, - bias=bias, - ) - - def forward(self, x): - return self.fc(x) +} - def _test_linear_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - tester = ( - ArmTester( - module, - 
example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .check_count({"torch.ops.aten.linear.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data) - def _test_linear_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.linear.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) - - def _test_linear_tosa_ethosu_BI_pipeline( +class Linear(torch.nn.Module): + def __init__( self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ) -> ArmTester: - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.linear.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - # TODO: Add FVP testing support. - return tester - - @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) - @pytest.mark.tosa_ref_model - def test_linear_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - out_features: int, - has_bias: bool, + in_features: int, + out_features: int = 3, + bias: bool = True, ): - in_features = test_data.shape[-1] - test_data = (test_data,) - self._test_linear_tosa_MI_pipeline( - self.Linear( - in_features=in_features, - out_features=out_features, - bias=has_bias, - ), - test_data, + super().__init__() + self.fc = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + bias=bias, ) - @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) - @pytest.mark.tosa_ref_model - def test_linear_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - out_features: int, - has_bias: bool, - ): - in_features = test_data.shape[-1] - test_data = (test_data,) - self._test_linear_tosa_BI_pipeline( - self.Linear( - in_features=in_features, out_features=out_features, bias=has_bias - ), - test_data, - ) - - @parameterized.expand(test_data_suite_rank1) - @pytest.mark.corstone_fvp - def test_linear_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - out_features: int, - has_bias: bool, - ): - in_features = test_data.shape[-1] - test_data = (test_data,) - tester = self._test_linear_tosa_ethosu_BI_pipeline( - self.Linear( - in_features=in_features, - out_features=out_features, - bias=has_bias, - ), - common.get_u55_compile_spec(), - test_data, - ) - - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) - @pytest.mark.corstone_fvp - def test_linear_tosa_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - out_features: int, - has_bias: bool, - ): - in_features 
= test_data.shape[-1] - test_data = (test_data,) - self._test_linear_tosa_ethosu_BI_pipeline( - self.Linear( - in_features=in_features, - out_features=out_features, - bias=has_bias, - ), - common.get_u85_compile_spec(), - test_data, - ) + def forward(self, x): + return self.fc(x) + + +@common.parametrize("test_data", test_data_suite_rank1 | test_data_suite_rank4) +def test_linear_tosa_MI(test_data: torch.Tensor): + test_data, out_features, has_bias = test_data() + in_features = test_data.shape[-1] + pipeline = TosaPipelineMI[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness. +@common.parametrize("test_data", test_data_suite_rank1 | test_data_suite_rank4) +def test_linear_tosa_BI(test_data: torch.Tensor): + test_data, out_features, has_bias = test_data() + in_features = test_data.shape[-1] + pipeline = TosaPipelineBI[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_rank1) +@common.XfailIfNoCorstone300 +def test_linear_u55_BI(test_data: torch.Tensor): + test_data, out_features, has_bias = test_data() + in_features = test_data.shape[-1] + EthosU55PipelineBI[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ).run() + + +x_fail = { + "model_linear_rank4_zeros": "AssertionError: Output 0 does not match reference output.", + "model_linear_rank4_ones": "AssertionError: Output 0 does not match reference output.", + "model_linear_rank4_negative_ones": "AssertionError: Output 0 does not match reference output.", + "model_linear_rank4_rand": "AssertionError: Output 0 does not match reference output.", + "model_linear_rank4_negative_large_rand": "AssertionError: Output 0 does not match reference output.", + "model_linear_rank4_large_randn": "AssertionError: Output 0 does not match reference output.", +} + + +@common.parametrize( + "test_data", + test_data_suite_rank1 | test_data_suite_rank4, + x_fail, +) +@common.XfailIfNoCorstone320 +def test_linear_u85_BI(test_data: torch.Tensor): + test_data, out_features, has_bias = test_data() + in_features = test_data.shape[-1] + EthosU85PipelineBI[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ).run() diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 0226a62328b..0ca4510681d 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -1,127 +1,75 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
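Two details worth calling out in the linear refactor just above: the rank-1 and rank-4 suites are merged with the PEP 584 dict-union operator (so the new tests require Python 3.9+), and in_features is recovered from the sample's last dimension instead of being stored alongside it. A minimal sketch of that unpacking, with illustrative data:

    import torch

    rank1 = {"rank1_rand": lambda: (torch.rand(10), 10, True)}
    rank4 = {"rank4_rand": lambda: (torch.rand(5, 10, 25, 20), 30, False)}

    for name, make in (rank1 | rank4).items():  # '|' keeps entries from both suites
        data, out_features, has_bias = make()   # unpack (data, out_features, bias)
        model = torch.nn.Linear(data.shape[-1], out_features, bias=has_bias)
        assert model(data).shape[-1] == out_features  # Linear maps only the last dim
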
-import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.log.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_log_default" + +input_t1 = Tuple[torch.Tensor] -test_data_suite = [ +test_data_suite = { # (test_name, test_data) - ("ones_rank4", torch.ones(1, 10, 10, 10)), - ("ones_rank3", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) + 0.001), - ("randn_pos", torch.randn(10) + 10), - ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), - ("ramp", torch.arange(0.01, 20, 0.2)), -] - - -class TestLog(unittest.TestCase): - """Tests lowering of aten.log""" - - class Log(torch.nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - return torch.log(x) - - def _test_log_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.log.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.log.default"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_log_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.log.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_log_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_log_tosa_MI_pipeline(self.Log(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_log_ethosu_BI_pipeline( - 
common.get_u55_compile_spec(), self.Log(), (test_data,) - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_log_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_log_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Log(), (test_data,) - ) + "ones_rank4": lambda: (torch.ones(1, 10, 10, 10)), + "ones_rank3": lambda: (torch.ones(10, 10, 10)), + "rand": lambda: (torch.rand(10, 10) + 0.001), + "randn_pos": lambda: (torch.randn(10) + 10), + "randn_spread": lambda: (torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), + "ramp": lambda: (torch.arange(0.01, 20, 0.2)), +} + + +class Log(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.log(x) + + +@common.parametrize("test_data", test_data_suite) +def test_log_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1](Log(), (test_data(),), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_log_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1](Log(), (test_data(),), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_log_u55_BI(test_data: input_t1): + EthosU55PipelineBI[input_t1]( + Log(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_log_u85_BI(test_data: input_t1): + EthosU85PipelineBI[input_t1]( + Log(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=True, + ).run() diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index a4b66339b0c..139653eea97 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -23,19 +23,19 @@ class LogicalBinary(torch.nn.Module): test_data: dict[input_t2] = { - "rank1": ( + "rank1": lambda: ( torch.tensor([True, True, False, False], dtype=torch.bool), torch.tensor([True, False, True, False], dtype=torch.bool), ), - "rand_rank2": ( + "rand_rank2": lambda: ( torch.randint(0, 2, (10, 10), dtype=torch.bool), torch.randint(0, 2, (10, 10), dtype=torch.bool), ), - "rand_rank3": ( + "rand_rank3": lambda: ( torch.randint(0, 2, (10, 10, 10), dtype=torch.bool), torch.randint(0, 2, (10, 10, 10), dtype=torch.bool), ), - "rand_rank4": ( + "rand_rank4": lambda: ( torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool), torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool), ), @@ -68,10 +68,10 @@ def forward(self, tensor1: torch.Tensor, tensor2: torch.Tensor): class Not(torch.nn.Module): test_data: dict[input_t1] = { - "rank1": (torch.tensor([True, True, False, False], dtype=torch.bool),), - "rand_rank2": (torch.randint(0, 2, (10, 10), dtype=torch.bool),), - "rand_rank3": (torch.randint(0, 2, (10, 10, 10), dtype=torch.bool),), - "rand_rank4": (torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool),), + "rank1": lambda: (torch.tensor([True, True, False, False], dtype=torch.bool),), + "rand_rank2": lambda: (torch.randint(0, 2, (10, 10), dtype=torch.bool),), + "rand_rank3": lambda: (torch.randint(0, 2, (10, 10, 10), dtype=torch.bool),), + "rand_rank4": lambda: (torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool),), } aten_op = "torch.ops.aten.logical_not.default" @@ -83,23 +83,31 @@ def forward(self, tensor: torch.Tensor): @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](And(), test_data, 
And().aten_op, And().exir_op) + pipeline = TosaPipelineMI[input_t2]( + And(), test_data(), And().aten_op, And().exir_op + ) pipeline.run() @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](And(), test_data, And().aten_op, And().exir_op) + pipeline = TosaPipelineBI[input_t2]( + And(), test_data(), And().aten_op, And().exir_op + ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", And().test_data) -def test_logical_and_u55_BI(test_data: input_t2): +def test_logical_and_u55_BI_not_delegated(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. pipeline = OpNotSupportedPipeline[input_t2]( - And(), test_data, "TOSA-0.80+BI+u55", {And().exir_op: 1} + And(), + test_data(), + {And().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -109,7 +117,7 @@ def test_logical_and_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_logical_and_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - And(), test_data, And().aten_op, And().exir_op, run_on_fvp=True + And(), test_data(), And().aten_op, And().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") @@ -118,23 +126,31 @@ def test_logical_and_u85_BI(test_data: input_t2): @common.parametrize("test_data", Xor().test_data) def test_logical_xor_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Xor(), test_data, Xor().aten_op, Xor().exir_op) + pipeline = TosaPipelineMI[input_t2]( + Xor(), test_data(), Xor().aten_op, Xor().exir_op + ) pipeline.run() @common.parametrize("test_data", Xor().test_data) def test_logical_xor_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Xor(), test_data, Xor().aten_op, Xor().exir_op) + pipeline = TosaPipelineBI[input_t2]( + Xor(), test_data(), Xor().aten_op, Xor().exir_op + ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", Xor().test_data) -def test_logical_xor_u55_BI(test_data: input_t2): +def test_logical_xor_u55_BI_not_delegated(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. 
pipeline = OpNotSupportedPipeline[input_t2]( - Xor(), test_data, "TOSA-0.80+BI+u55", {Xor().exir_op: 1} + Xor(), + test_data(), + {Xor().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -144,7 +160,7 @@ def test_logical_xor_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_logical_xor_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Xor(), test_data, Xor().aten_op, Xor().exir_op, run_on_fvp=True + Xor(), test_data(), Xor().aten_op, Xor().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") @@ -153,33 +169,37 @@ def test_logical_xor_u85_BI(test_data: input_t2): @common.parametrize("test_data", Or().test_data) def test_logical_or_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Or(), test_data, Or().aten_op, Or().exir_op) + pipeline = TosaPipelineMI[input_t2](Or(), test_data(), Or().aten_op, Or().exir_op) pipeline.run() @common.parametrize("test_data", Or().test_data) def test_logical_or_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Or(), test_data, Or().aten_op, Or().exir_op) + pipeline = TosaPipelineBI[input_t2](Or(), test_data(), Or().aten_op, Or().exir_op) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", Or().test_data) -def test_logical_or_u55_BI(test_data: input_t2): +def test_logical_or_u55_BI_not_delegated(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. pipeline = OpNotSupportedPipeline[input_t2]( - Or(), test_data, "TOSA-0.80+BI+u55", {Or().exir_op: 1} + Or(), + test_data(), + {Or().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @common.parametrize("test_data", Or().test_data) @pytest.mark.xfail(reason="MLETORCH-706: Support ScalarType::Bool in EthosUBackend.") -@common.XfailIfNoCorstone320 # TODO: Refactor to use XfailIfNoCorstone320 once MLETORCH-706 is done +@common.XfailIfNoCorstone320 def test_logical_or_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Or(), test_data, Or().aten_op, Or().exir_op, run_on_fvp=True + Or(), test_data(), Or().aten_op, Or().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") @@ -188,23 +208,31 @@ def test_logical_or_u85_BI(test_data: input_t2): @common.parametrize("test_data", Not().test_data) def test_logical_not_tosa_MI(test_data: input_t2): - pipeline = TosaPipelineMI[input_t2](Not(), test_data, Not().aten_op, Not().exir_op) + pipeline = TosaPipelineMI[input_t2]( + Not(), test_data(), Not().aten_op, Not().exir_op + ) pipeline.run() @common.parametrize("test_data", Not().test_data) def test_logical_not_tosa_BI(test_data: input_t2): - pipeline = TosaPipelineBI[input_t2](Not(), test_data, Not().aten_op, Not().exir_op) + pipeline = TosaPipelineBI[input_t2]( + Not(), test_data(), Not().aten_op, Not().exir_op + ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", Not().test_data) -def test_logical_not_u55_BI(test_data: input_t2): +def test_logical_not_u55_BI_not_delegated(test_data: input_t2): # Tests that we don't delegate these ops since they are not supported on U55. 
pipeline = OpNotSupportedPipeline[input_t2]( - Not(), test_data, "TOSA-0.80+BI+u55", {Not().exir_op: 1} + Not(), + test_data(), + {Not().exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -214,7 +242,7 @@ def test_logical_not_u55_BI(test_data: input_t2): @common.XfailIfNoCorstone320 def test_logical_not_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( - Not(), test_data, Not().aten_op, Not().exir_op, run_on_fvp=True + Not(), test_data(), Not().aten_op, Not().exir_op, run_on_fvp=True ) pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 7068ee77e01..50132ba8211 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -5,6 +5,8 @@ from typing import Tuple +import pytest + import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -29,20 +31,20 @@ def forward(self, x): return self.log_softmax(x) test_data = { - "ones": ((torch.ones(10, 10),), 1), - "ones_neg_dim": ((torch.ones(1, 3, 4),), -1), - "randn_neg_dim": ((torch.randn(1, 5, 8, 7),), -3), - "zeros": ((torch.zeros(1, 8, 5, 2),), 0), - "zeros_neg_dim": ((torch.zeros(1, 7, 8, 9),), -4), - "rand": ((torch.rand(1, 2, 5, 8),), 2), - "rand_neg_dim": ((torch.rand(1, 10, 8, 10),), -2), - "randn_mult_batches": ((torch.randn(2, 10, 10, 10),), 3), + "ones": lambda: ((torch.ones(10, 10),), 1), + "ones_neg_dim": lambda: ((torch.ones(1, 3, 4),), -1), + "randn_neg_dim": lambda: ((torch.randn(1, 5, 8, 7),), -3), + "zeros": lambda: ((torch.zeros(1, 8, 5, 2),), 0), + "zeros_neg_dim": lambda: ((torch.zeros(1, 7, 8, 9),), -4), + "rand": lambda: ((torch.rand(1, 2, 5, 8),), 2), + "rand_neg_dim": lambda: ((torch.rand(1, 10, 8, 10),), -2), + "randn_mult_batches": lambda: ((torch.randn(2, 10, 10, 10),), 3), } @common.parametrize("test_data", LogSoftmax.test_data) def test_log_softmax_tosa_MI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = TosaPipelineMI[input_t1](LogSoftmax(dim), data, []) pipeline.add_stage_after( "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op] @@ -51,9 +53,10 @@ def test_log_softmax_tosa_MI(test_data): pipeline.run() +@pytest.mark.flaky(reruns=5) @common.parametrize("test_data", LogSoftmax.test_data) def test_log_softmax_tosa_BI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = TosaPipelineBI[input_t1](LogSoftmax(dim), data, []) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -69,8 +72,13 @@ def test_log_softmax_tosa_BI(test_data): ) @common.XfailIfNoCorstone300() def test_log_softmax_u55_BI(test_data): - data, dim = test_data - pipeline = EthosU55PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True) + data, dim = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + LogSoftmax(dim), + data, + [], + run_on_fvp=True, + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() @@ -85,8 +93,13 @@ def test_log_softmax_u55_BI(test_data): ) @common.XfailIfNoCorstone320 def test_log_softmax_u85_BI(test_data): - data, dim = test_data - pipeline = EthosU85PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True) + data, dim = test_data() + pipeline = EthosU85PipelineBI[input_t1]( + LogSoftmax(dim), + data, 
+ [], + run_on_fvp=True, + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_lshift.py b/backends/arm/test/ops/test_lshift.py index f6ddabf6612..e74e80deeed 100644 --- a/backends/arm/test/ops/test_lshift.py +++ b/backends/arm/test/ops/test_lshift.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.arm.test import common from executorch.backends.arm.test.common import ( XfailIfNoCorstone300, XfailIfNoCorstone320, @@ -14,7 +15,6 @@ TosaPipelineBI, TosaPipelineMI, ) -from parameterized import parameterized scalar_input_t = tuple[torch.Tensor, int] @@ -23,11 +23,20 @@ class LshiftScalar(torch.nn.Module): torch_op_MI = "torch.ops.aten.__lshift__.Scalar" torch_op_BI = "torch.ops.aten.bitwise_left_shift.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_left_shift_Tensor" - test_data = [ - ((torch.randint(-8, 8, (1, 12, 3, 4), dtype=torch.int8), 1),), - ((torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int16), 5),), - ((torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int32), 2),), - ] + test_data = { + "randint_neg_8_int8": ( + torch.randint(-8, 8, (1, 12, 3, 4), dtype=torch.int8), + 1, + ), + "randint_neg_100_int16": ( + torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int16), + 5, + ), + "randint_neg_100_int32": ( + torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int32), + 2, + ), + } def forward(self, x: torch.Tensor, shift: int): return x << shift @@ -39,33 +48,27 @@ def forward(self, x: torch.Tensor, shift: int): class LshiftTensor(torch.nn.Module): torch_op = "torch.ops.aten.bitwise_left_shift.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_left_shift_Tensor" - test_data = [ - ( - ( - torch.randint(-8, 8, (3, 3), dtype=torch.int8), - torch.randint(0, 4, (3, 3), dtype=torch.int8), - ), + test_data = { + "randint_neg_8_tensor_int8": ( + torch.randint(-8, 8, (3, 3), dtype=torch.int8), + torch.randint(0, 4, (3, 3), dtype=torch.int8), ), - ( - ( - torch.randint(-1024, 1024, (3, 3, 3), dtype=torch.int16), - torch.randint(0, 5, (3, 3, 3), dtype=torch.int16), - ), + "randint_neg_1024_tensor_int16": ( + torch.randint(-1024, 1024, (3, 3, 3), dtype=torch.int16), + torch.randint(0, 5, (3, 3, 3), dtype=torch.int16), ), - ( - ( - torch.randint(0, 127, (1, 2, 3, 3), dtype=torch.int32), - torch.randint(0, 5, (1, 2, 3, 3), dtype=torch.int32), - ), + "randint_0_tensor_int16": ( + torch.randint(0, 127, (1, 2, 3, 3), dtype=torch.int32), + torch.randint(0, 5, (1, 2, 3, 3), dtype=torch.int32), ), - ] + } def forward(self, x: torch.Tensor, shift: torch.Tensor): return x.bitwise_left_shift(shift) -@parameterized.expand(LshiftScalar.test_data) -def test_lshift_scalar_tosa_MI(test_data): +@common.parametrize("test_data", LshiftScalar.test_data) +def test_lshift_scalar_tosa_MI_scalar(test_data): TosaPipelineMI[scalar_input_t]( LshiftScalar(), test_data, @@ -74,18 +77,21 @@ def test_lshift_scalar_tosa_MI(test_data): ).run() -@parameterized.expand(LshiftScalar.test_data) -def test_lshift_scalar_tosa_BI(test_data): +@common.parametrize("test_data", LshiftScalar.test_data) +def test_bitwise_left_shift_tensor_tosa_BI_scalar(test_data): pipeline = TosaPipelineBI[scalar_input_t]( - LshiftScalar(), test_data, LshiftScalar.torch_op_BI, LshiftScalar.exir_op + LshiftScalar(), + test_data, + LshiftScalar.torch_op_BI, + LshiftScalar.exir_op, ) 
pipeline.pop_stage("check.quant_nodes") pipeline.run() -@parameterized.expand(LshiftScalar.test_data) +@common.parametrize("test_data", LshiftScalar.test_data) @XfailIfNoCorstone300 -def test_lshift_scalar_tosa_u55(test_data): +def test_bitwise_left_shift_tensor_u55_BI_scalar(test_data): pipeline = EthosU55PipelineBI[scalar_input_t]( LshiftScalar(), test_data, @@ -97,9 +103,9 @@ def test_lshift_scalar_tosa_u55(test_data): pipeline.run() -@parameterized.expand(LshiftScalar.test_data) +@common.parametrize("test_data", LshiftScalar.test_data) @XfailIfNoCorstone320 -def test_lshift_scalar_tosa_u85(test_data): +def test_bitwise_left_shift_tensor_u85_BI_scalar(test_data): pipeline = EthosU85PipelineBI[scalar_input_t]( LshiftScalar(), test_data, @@ -111,8 +117,8 @@ def test_lshift_scalar_tosa_u85(test_data): pipeline.run() -@parameterized.expand(LshiftTensor.test_data) -def test_lshift_tensor_tosa_MI(test_data): +@common.parametrize("test_data", LshiftTensor.test_data) +def test_lshift_scalar_tosa_MI(test_data): TosaPipelineMI[scalar_input_t]( LshiftTensor(), test_data, @@ -121,18 +127,21 @@ def test_lshift_tensor_tosa_MI(test_data): ).run() -@parameterized.expand(LshiftTensor.test_data) -def test_lshift_tensor_tosa_BI(test_data): +@common.parametrize("test_data", LshiftTensor.test_data) +def test_bitwise_left_shift_tensor_tosa_BI(test_data): pipeline = TosaPipelineBI[scalar_input_t]( - LshiftTensor(), test_data, LshiftTensor.torch_op, LshiftTensor.exir_op + LshiftTensor(), + test_data, + LshiftTensor.torch_op, + LshiftTensor.exir_op, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() -@parameterized.expand(LshiftTensor.test_data) +@common.parametrize("test_data", LshiftTensor.test_data) @XfailIfNoCorstone300 -def test_lshift_tensor_tosa_u55(test_data): +def test_bitwise_left_shift_tensor_u55_BI(test_data): pipeline = EthosU55PipelineBI[scalar_input_t]( LshiftTensor(), test_data, @@ -144,9 +153,9 @@ def test_lshift_tensor_tosa_u55(test_data): pipeline.run() -@parameterized.expand(LshiftTensor.test_data) +@common.parametrize("test_data", LshiftTensor.test_data) @XfailIfNoCorstone320 -def test_lshift_tensor_tosa_u85(test_data): +def test_bitwise_left_shift_tensor_u85_BI(test_data): pipeline = EthosU85PipelineBI[scalar_input_t]( LshiftTensor(), test_data, diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index f5664b7895d..92298ca70fa 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -63,24 +63,27 @@ def get_inputs(self): op_lt_scalar_rank4_randn = LessThan(torch.randn(3, 2, 2, 2), 0.3) test_data_tensor = { - "lt_tensor_rank1_ones": op_lt_tensor_rank1_ones, - "lt_tensor_rank2_rand": op_lt_tensor_rank2_rand, - "lt_tensor_rank3_randn": op_lt_tensor_rank3_randn, - "lt_tensor_rank4_randn": op_lt_tensor_rank4_randn, + "lt_tensor_rank1_ones": lambda: op_lt_tensor_rank1_ones, + "lt_tensor_rank2_rand": lambda: op_lt_tensor_rank2_rand, + "lt_tensor_rank3_randn": lambda: op_lt_tensor_rank3_randn, + "lt_tensor_rank4_randn": lambda: op_lt_tensor_rank4_randn, } test_data_scalar = { - "lt_scalar_rank1_ones": op_lt_scalar_rank1_ones, - "lt_scalar_rank2_rand": op_lt_scalar_rank2_rand, - "lt_scalar_rank3_randn": op_lt_scalar_rank3_randn, - "lt_scalar_rank4_randn": op_lt_scalar_rank4_randn, + "lt_scalar_rank1_ones": lambda: op_lt_scalar_rank1_ones, + "lt_scalar_rank2_rand": lambda: op_lt_scalar_rank2_rand, + "lt_scalar_rank3_randn": lambda: op_lt_scalar_rank3_randn, + "lt_scalar_rank4_randn": lambda: op_lt_scalar_rank4_randn, } 
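The pipelines being customized in these diffs expose named stages: pop_stage drops one (the shift tests above pop "check.quant_nodes" because their integer inputs are never quantized), add_stage_after splices in an extra check, and change_args overrides a stage's stored arguments. A toy stand-in for that named-stage mechanic, to show the idea only; this is not the real executorch test-pipeline implementation:

    from typing import Any, Callable

    class ToyPipeline:
        """Toy stand-in: an ordered list of named stages that can be edited."""

        def __init__(self) -> None:
            self.stages: list[tuple[str, Callable[..., Any], tuple]] = []

        def add_stage(self, name: str, fn: Callable[..., Any], *args: Any) -> None:
            self.stages.append((name, fn, args))

        def pop_stage(self, name: str) -> None:
            # drop a stage by name, e.g. "check.quant_nodes"
            self.stages = [s for s in self.stages if s[0] != name]

        def add_stage_after(self, name: str, fn: Callable[..., Any], *args: Any) -> None:
            # splice an extra stage in right after an existing one
            i = next(i for i, s in enumerate(self.stages) if s[0] == name)
            self.stages.insert(i + 1, (fn.__name__, fn, args))

        def change_args(self, name: str, *args: Any) -> None:
            # override the stored arguments of a named stage
            self.stages = [
                (n, f, args) if n == name else (n, f, a) for n, f, a in self.stages
            ]

        def run(self) -> None:
            for _name, fn, args in self.stages:
                fn(*args)
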
@common.parametrize("test_module", test_data_tensor) def test_lt_tensor_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op + test_module(), + test_module().get_inputs(), + LessThan.aten_op_tensor, + LessThan.exir_op, ) pipeline.run() @@ -88,7 +91,10 @@ def test_lt_tensor_tosa_MI(test_module): @common.parametrize("test_module", test_data_scalar) def test_lt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), LessThan.aten_op_scalar, LessThan.exir_op + test_module(), + test_module().get_inputs(), + LessThan.aten_op_scalar, + LessThan.exir_op, ) pipeline.run() @@ -96,7 +102,10 @@ def test_lt_scalar_tosa_MI(test_module): @common.parametrize("test_module", test_data_tensor) def test_lt_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op + test_module(), + test_module().get_inputs(), + LessThan.aten_op_tensor, + LessThan.exir_op, ) pipeline.run() @@ -104,34 +113,39 @@ def test_lt_tensor_tosa_BI(test_module): @common.parametrize("test_module", test_data_scalar) def test_lt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op + test_module(), + test_module().get_inputs(), + LessThan.aten_op_tensor, + LessThan.exir_op, ) pipeline.run() @common.parametrize("test_module", test_data_tensor) @common.XfailIfNoCorstone300 -def test_lt_tensor_u55_BI(test_module): +def test_lt_tensor_u55_BI_not_delegated(test_module): # LessThan is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {LessThan.exir_op: 1}, + quantize=True, + u55_subset=True, ) pipeline.run() @common.parametrize("test_module", test_data_scalar) @common.XfailIfNoCorstone300 -def test_lt_scalar_u55_BI(test_module): +def test_lt_scalar_u55_BI_not_delegated(test_module): # LessThan is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), {LessThan.exir_op: 1}, n_expected_delegates=1, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -146,8 +160,8 @@ def test_lt_scalar_u55_BI(test_module): @common.XfailIfNoCorstone320 def test_lt_tensor_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, run_on_fvp=True, @@ -165,8 +179,8 @@ def test_lt_tensor_u85_BI(test_module): @common.XfailIfNoCorstone320 def test_lt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( - test_module, - test_module.get_inputs(), + test_module(), + test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 4db8c62bd88..a1fd3ea30ec 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -5,280 +5,183 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
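Throughout these files, @pytest.mark.corstone_fvp plus runtime is_option_enabled() checks give way to common.XfailIfNoCorstone300/320 decorators with run_on_fvp=True, so FVP-dependent tests fail (expectedly) instead of silently skipping the output comparison. One plausible shape for such a decorator, sketched with plain pytest; the real decorators live in executorch.backends.arm.test.common, and the FVP binary names here are assumptions:

    import shutil

    import pytest

    def xfail_if_missing(binary: str):
        """xfail a test when the required Corstone FVP binary is not on PATH."""
        return pytest.mark.xfail(
            condition=shutil.which(binary) is None,
            raises=Exception,
            reason=f"{binary} not installed",
        )

    # Assumed binary names for the Corstone-300 (Ethos-U55) and Corstone-320 FVPs.
    XfailIfNoCorstone300 = xfail_if_missing("FVP_Corstone_SSE-300_Ethos-U55")
    XfailIfNoCorstone320 = xfail_if_missing("FVP_Corstone_SSE-320")
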
-import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, -) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification - -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - - -test_data_suite = [ - # (test_name, test_data, [kernel_size, stride, padding]) - ("zeros", torch.zeros(1, 1, 4, 8), [2, 2, 1]), - ("ones", torch.ones(1, 16, 50, 32), [4, 2, 0]), - ("rand", torch.rand(1, 16, 52, 16), [4, 3, 0]), - ("non_divisible", torch.rand(1, 16, 112, 112), [3, 2, 1]), -] - -test_data_suite_mult_batches = [ - ("randn", torch.randn(5, 16, 50, 32), [4, 2, 0]), -] - - -class TestMaxPool2d(unittest.TestCase): - """Tests MaxPool2d.""" - - class MaxPool2d(torch.nn.Module): - def __init__( - self, - kernel_size: int | Tuple[int, int], - stride: int | Tuple[int, int], - padding: int | Tuple[int, int], - ): - super().__init__() - self.max_pool_2d = torch.nn.MaxPool2d( - kernel_size=kernel_size, stride=stride, padding=padding - ) - - def forward(self, x): - return self.max_pool_2d(x) - - def _test_maxpool2d_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .check(["torch.ops.aten.max_pool2d.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_max_pool2d_default"]) - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default" - ] - ) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_maxpool2d_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.max_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_max_pool2d_default"]) - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default" - ] - ) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - def _test_maxpool2d_tosa_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.tensor], - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.max_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - 
.check_not(["executorch_exir_dialects_edge__ops_aten_max_pool2d_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - - return tester +from executorch.backends.arm.test import common - @parameterized.expand(test_data_suite) - def test_maxpool2d_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_maxpool2d_tosa_MI_pipeline( - self.MaxPool2d(*model_params), (test_data,) - ) - - @parameterized.expand(test_data_suite) - def test_maxpool2d_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_maxpool2d_tosa_BI_pipeline( - self.MaxPool2d(*model_params), (test_data,) - ) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_maxpool2d_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( - self.MaxPool2d(*model_params), - common.get_u55_compile_spec(), - (test_data,), - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_maxpool2d_tosa_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( - self.MaxPool2d(*model_params), - common.get_u85_compile_spec(), - (test_data,), - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) +test_data_suite = { + # (test_name, test_data, [kernel_size, stride, padding]) + "zeros": lambda: (torch.zeros(1, 1, 4, 8), [2, 2, 1]), + "ones": lambda: (torch.ones(1, 16, 50, 32), [4, 2, 0]), + "rand": lambda: (torch.rand(1, 16, 52, 16), [4, 3, 0]), + "non_divisible": lambda: (torch.rand(1, 16, 112, 112), [3, 2, 1]), +} - @parameterized.expand(test_data_suite_mult_batches) - def test_maxpool2d_tosa_MI_mult_batches( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_maxpool2d_tosa_MI_pipeline( - self.MaxPool2d(*model_params), (test_data,) - ) +test_data_suite_mult_batches = { + "randn": lambda: (torch.randn(5, 16, 50, 32), [4, 2, 0]), +} - @parameterized.expand(test_data_suite_mult_batches) - def test_maxpool2d_tosa_BI_mult_batches( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_maxpool2d_tosa_BI_pipeline( - self.MaxPool2d(*model_params), (test_data,) - ) - @parameterized.expand(test_data_suite_mult_batches) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP # TODO: MLETORCH-433 - def test_maxpool2d_tosa_u85_BI_mult_batches( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( - self.MaxPool2d(*model_params), - common.get_u85_compile_spec(), - (test_data,), - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) +aten_op = "torch.ops.aten.max_pool2d.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_max_pool2d_default" - @parameterized.expand(test_data_suite_mult_batches) - 
@pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP # TODO: MLETORCH-433 - def test_maxpool2d_tosa_u55_BI_mult_batches( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( - self.MaxPool2d(*model_params), - common.get_u55_compile_spec(), - (test_data,), - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) +input_t1 = Tuple[torch.Tensor] - reject_data_suite = [ - (MaxPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), - (MaxPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)), - (MaxPool2d((800, 90), 1, 0), torch.rand(1, 16, 850, 100)), - ] - @parameterized.expand(reject_data_suite) - def test_reject_maxpool2d_u55_BI( +class MaxPool2d(torch.nn.Module): + def __init__( self, - module: torch.nn.Module, - test_data: torch.tensor, + kernel_size: int | Tuple[int, int], + stride: int | Tuple[int, int], + padding: int | Tuple[int, int], ): - compile_spec = common.get_u55_compile_spec() - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() + super().__init__() + self.max_pool_2d = torch.nn.MaxPool2d( + kernel_size=kernel_size, stride=stride, padding=padding ) - ( - ArmTester( - module, - example_inputs=(test_data,), - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.max_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check( - [ - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default" - ] - ) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 0}) - ) + def forward(self, x): + return self.max_pool_2d(x) + + +@common.parametrize("test_data", test_data_suite) +def test_max_pool2d_tosa_MI(test_data: torch.Tensor): + test_data, model_params = test_data() + pipeline = TosaPipelineMI[input_t1]( + MaxPool2d(*model_params), (test_data,), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_max_pool2d_tosa_BI(test_data: torch.Tensor): + test_data, model_params = test_data() + pipeline = TosaPipelineBI[input_t1]( + MaxPool2d(*model_params), + (test_data,), + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_max_pool2d_u55_BI(test_data: torch.Tensor): + test_data, model_params = test_data() + EthosU55PipelineBI[input_t1]( + MaxPool2d(*model_params), + (test_data,), + aten_op, + exir_ops=[], + symmetric_io_quantization=True, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_max_pool2d_u85_BI(test_data: torch.Tensor): + test_data, model_params = test_data() + EthosU85PipelineBI[input_t1]( + MaxPool2d(*model_params), + (test_data,), + aten_op, + exir_ops=[], + symmetric_io_quantization=True, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", test_data_suite_mult_batches) +def test_max_pool2d_tosa_MI_mult_batches(test_data: torch.Tensor): + test_data, model_params = test_data() + pipeline = TosaPipelineMI[input_t1]( + MaxPool2d(*model_params), + (test_data,), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_mult_batches) +def test_max_pool2d_tosa_BI_mult_batches(test_data: torch.Tensor): + test_data, model_params = test_data() + 
pipeline = TosaPipelineBI[input_t1](
+        MaxPool2d(*model_params),
+        (test_data,),
+        aten_op,
+        exir_op,
+        symmetric_io_quantization=True,
+    )
+    pipeline.run()
+
+
+x_fail = {"randn": "MLETORCH-986: Numerical issues with multi batches."}
+
+
+@common.parametrize("test_data", test_data_suite_mult_batches, x_fail)
+@common.XfailIfNoCorstone300
+def test_max_pool2d_u55_BI_mult_batches(test_data: torch.Tensor):
+    test_data, model_params = test_data()
+    EthosU55PipelineBI[input_t1](
+        MaxPool2d(*model_params),
+        (test_data,),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=True,
+        symmetric_io_quantization=True,
+        use_to_edge_transform_and_lower=True,
+    ).run()
+
+
+@common.parametrize("test_data", test_data_suite_mult_batches, x_fail)
+@common.XfailIfNoCorstone320
+def test_max_pool2d_u85_BI_mult_batches(test_data: torch.Tensor):
+    test_data, model_params = test_data()
+    EthosU85PipelineBI[input_t1](
+        MaxPool2d(*model_params),
+        (test_data,),
+        aten_op,
+        exir_op,
+        run_on_fvp=True,
+        symmetric_io_quantization=True,
+        use_to_edge_transform_and_lower=True,
+    ).run()
+
+
+reject_data_suite = {
+    "reject_1": lambda: (MaxPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)),
+    "reject_2": lambda: (MaxPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)),
+    "reject_3": lambda: (MaxPool2d((800, 90), 1, 0), torch.rand(1, 16, 850, 100)),
+}
+
+
+@common.parametrize("test_data", reject_data_suite)
+@common.XfailIfNoCorstone300
+def test_max_pool2d_u55_BI_failure_set(test_data: Tuple):
+    module, test_data = test_data()
+    pipeline = EthosU55PipelineBI[input_t1](
+        module,
+        (test_data,),
+        aten_op,
+        exir_op,
+        run_on_fvp=False,
+        symmetric_io_quantization=True,
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.pop_stage("check_count.exir")
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py
index a255496d517..adcc7dc9cab 100644
--- a/backends/arm/test/ops/test_maximum.py
+++ b/backends/arm/test/ops/test_maximum.py
@@ -1,127 +1,75 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
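# A note on the pattern used across these rewritten suites: the old
# parameterized.expand lists become dicts of zero-argument lambdas, which
# common.parametrize expands into named pytest cases, with an optional
# xfail dict (like x_fail above) marking known-bad cases instead of
# deleting them. A minimal sketch of how such a helper can be built on
# plain pytest -- illustrative only, not the actual Arm test-utils code:

import pytest


def parametrize(arg_name, test_suite, xfails=None):
    # Factories are passed through unevaluated; each test calls
    # test_data() itself, so tensors are built per test rather than at
    # collection time.
    xfails = xfails or {}
    params = [
        pytest.param(
            fn,
            id=name,
            marks=pytest.mark.xfail(reason=xfails[name]) if name in xfails else (),
        )
        for name, fn in test_suite.items()
    ]
    return pytest.mark.parametrize(arg_name, params)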
-import unittest from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestMaximum(unittest.TestCase): - """Tests a single maximum op""" - - class Maximum(torch.nn.Module): - test_parameters = [ - ( - torch.FloatTensor([1, 2, 3, 5, 7]), - (torch.FloatTensor([2, 1, 2, 1, 10])), - ), - (torch.ones(1, 10, 4, 6), 2 * torch.ones(1, 10, 4, 6)), - (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), - (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), - ] - - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.maximum(x, y) - - def _test_maximum_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.maximum.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_maximum_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.maximum.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_maximum_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .to_edge() - .partition() - .to_executorch() - .serialize() - ) - - return tester - - @parameterized.expand(Maximum.test_parameters) - def test_maximum_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_maximum_tosa_MI_pipeline(self.Maximum(), test_data) - - @parameterized.expand(Maximum.test_parameters) - def test_maximum_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_maximum_tosa_BI_pipeline(self.Maximum(), test_data) - - @parameterized.expand(Maximum.test_parameters) - def test_maximum_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - tester = self._test_maximum_ethos_BI_pipeline( - self.Maximum(), common.get_u55_compile_spec(), test_data - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(Maximum.test_parameters) - def test_maximum_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - tester = self._test_maximum_ethos_BI_pipeline( - self.Maximum(), common.get_u85_compile_spec(), test_data - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) +from executorch.backends.arm.test import 
common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +test_t = tuple[torch.Tensor, torch.Tensor] +aten_op = "torch.ops.aten.maximum.default" + + +class Maximum(torch.nn.Module): + test_parameters = { + "float_tensor": lambda: ( + torch.FloatTensor([1, 2, 3, 5, 7]), + (torch.FloatTensor([2, 1, 2, 1, 10])), + ), + "ones": lambda: (torch.ones(1, 10, 4, 6), 2 * torch.ones(1, 10, 4, 6)), + "rand_diff": lambda: (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), + "rand_same": lambda: (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), + "rand_large": lambda: ( + 10000 * torch.randn(1, 1, 4, 4), + torch.randn(1, 1, 4, 1), + ), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.maximum(x, y) + + +@common.parametrize("test_data", Maximum.test_parameters) +def test_maximum_tosa_MI(test_data: Tuple): + TosaPipelineMI[test_t](Maximum(), test_data(), aten_op).run() + + +@common.parametrize("test_data", Maximum.test_parameters) +def test_maximum_tosa_BI(test_data: Tuple): + TosaPipelineBI[test_t](Maximum(), test_data(), aten_op).run() + + +@common.parametrize("test_data", Maximum.test_parameters) +@common.XfailIfNoCorstone300 +def test_maximum_u55_BI(test_data: Tuple): + EthosU55PipelineBI[test_t]( + Maximum(), + test_data(), + aten_op, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", Maximum.test_parameters) +@common.XfailIfNoCorstone320 +def test_maximum_u85_BI(test_data: Tuple): + EthosU85PipelineBI[test_t]( + Maximum(), + test_data(), + aten_op, + run_on_fvp=True, + ).run() diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 2351b0f9e9c..43063058805 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -20,10 +20,10 @@ class AdaptiveAveragePool2d(torch.nn.Module): test_data_suite = { # (test_name, test_data) - "zeros": (torch.zeros(1, 1280, 7, 7),), - "ones": (torch.ones(1, 1280, 7, 7),), - "rand": (torch.rand(1, 1280, 7, 7),), - "randn": (torch.randn(1, 1280, 7, 7),), + "zeros": lambda: (torch.zeros(1, 1280, 7, 7),), + "ones": lambda: (torch.ones(1, 1280, 7, 7),), + "rand": lambda: (torch.rand(1, 1280, 7, 7),), + "randn": lambda: (torch.randn(1, 1280, 7, 7),), } aten_op = "torch.ops.aten.adaptive_avg_pool2d.default" exir_op = "executorch_exir_dialects_edge__ops_aten_mean_dim" @@ -40,7 +40,7 @@ def forward(self, x): def test_adaptive_avg_pool2d_tosa_MI(test_data): TosaPipelineMI[input_t]( AdaptiveAveragePool2d(), - test_data, + test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, ).run() @@ -50,38 +50,18 @@ def test_adaptive_avg_pool2d_tosa_MI(test_data): def test_adaptive_avg_pool2d_tosa_BI(test_data): TosaPipelineBI[input_t]( AdaptiveAveragePool2d(), - test_data, + test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, ).run() @common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) -def test_adaptive_avg_pool2d_u55(test_data): - EthosU55PipelineBI[input_t]( - AdaptiveAveragePool2d(), - test_data, - AdaptiveAveragePool2d.aten_op, - AdaptiveAveragePool2d.exir_op, - ).run() - - -@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) -def test_adaptive_avg_pool2d_u85(test_data): - EthosU85PipelineBI[input_t]( - AdaptiveAveragePool2d(), - test_data, - AdaptiveAveragePool2d.aten_op, - AdaptiveAveragePool2d.exir_op, - ).run() - - -@common.parametrize("test_data", 
AdaptiveAveragePool2d.test_data_suite) -@common.SkipIfNoCorstone300 -def test_adaptive_avg_pool2d_u55_on_fvp(test_data): +@common.XfailIfNoCorstone300 +def test_adaptive_avg_pool2d_u55_BI(test_data): EthosU55PipelineBI[input_t]( AdaptiveAveragePool2d(), - test_data, + test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, run_on_fvp=True, @@ -89,11 +69,11 @@ def test_adaptive_avg_pool2d_u55_on_fvp(test_data): @common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) -@common.SkipIfNoCorstone320 -def test_adaptive_avg_pool2d_u85_on_fvp(test_data): +@common.XfailIfNoCorstone320 +def test_adaptive_avg_pool2d_u85_BI(test_data): EthosU85PipelineBI[input_t]( AdaptiveAveragePool2d(), - test_data, + test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, run_on_fvp=True, @@ -102,14 +82,14 @@ def test_adaptive_avg_pool2d_u85_on_fvp(test_data): class MeanDim(torch.nn.Module): test_data_suite: dict[str, tuple] = { - "zeros": (torch.zeros(1, 1280, 7, 7), -1, True), - "ones": (torch.ones(1, 1280, 7, 7), (-1, 2), False), - "rand": ( + "zeros": lambda: (torch.zeros(1, 1280, 7, 7), -1, True), + "ones": lambda: (torch.ones(1, 1280, 7, 7), (-1, 2), False), + "rand": lambda: ( torch.rand(1, 1280, 7, 7), (-1), True, ), - "randn": ( + "randn": lambda: ( torch.randn(1, 1280, 7, 7), (-1, -2, -3), False, @@ -128,20 +108,22 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", MeanDim.test_data_suite) -def test_mean_tosa_MI(test_data): +def test_mean_dim_tosa_MI(test_data): + test_data, dim, keep_dim = test_data() TosaPipelineMI[input_t]( - MeanDim(test_data[1], test_data[2]), - (test_data[0],), + MeanDim(dim, keep_dim), + (test_data,), MeanDim.torch_op, MeanDim.exir_op, ).run() @common.parametrize("test_data", MeanDim.test_data_suite) -def test_mean_tosa_BI(test_data): +def test_mean_dim_tosa_BI(test_data): + test_data, dim, keep_dim = test_data() pipeline = TosaPipelineBI[input_t]( - MeanDim(test_data[1], test_data[2]), - (test_data[0],), + MeanDim(dim, keep_dim), + (test_data,), "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -150,10 +132,11 @@ def test_mean_tosa_BI(test_data): @common.parametrize("test_data", MeanDim.test_data_suite) @common.XfailIfNoCorstone300 -def test_mean_u55_BI(test_data): +def test_mean_dim_u55_BI(test_data): + test_data, dim, keep_dim = test_data() pipeline = EthosU55PipelineBI[input_t]( - MeanDim(test_data[1], test_data[2]), - (test_data[0],), + MeanDim(dim, keep_dim), + (test_data,), "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition run_on_fvp=True, ) @@ -163,10 +146,11 @@ def test_mean_u55_BI(test_data): @common.parametrize("test_data", MeanDim.test_data_suite) @common.XfailIfNoCorstone320 -def test_mean_u85_BI(test_data): +def test_mean_dim_u85_BI(test_data): + test_data, dim, keep_dim = test_data() pipeline = EthosU85PipelineBI[input_t]( - MeanDim(test_data[1], test_data[2]), - (test_data[0],), + MeanDim(dim, keep_dim), + (test_data,), "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition run_on_fvp=True, ) diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py index 04693a46435..27922cda5e0 100644 --- a/backends/arm/test/ops/test_minimum.py +++ b/backends/arm/test/ops/test_minimum.py @@ -1,130 +1,75 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
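# The MeanDim BI tests above assert only on aten.sum.dim_IntList because the
# quantized flow decomposes mean.dim into a sum followed by a multiply with
# 1/N. The identity being relied on, as a standalone sanity check (this is
# not the backend's actual lowering pass):

import torch

x = torch.rand(1, 1280, 7, 7)
dims = (-1, -2)
n = x.shape[-1] * x.shape[-2]  # elements reduced per output value
assert torch.allclose(x.mean(dim=dims), x.sum(dim=dims) / n)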
-# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestMinimum(unittest.TestCase): - """Tests a single minimum op""" - - class Minimum(torch.nn.Module): - test_parameters = [ - ( - torch.FloatTensor([1, 2, 3, 5, 7]), - (torch.FloatTensor([2, 1, 2, 1, 10])), - ), - (torch.ones(1, 10, 4, 6), 2 * torch.ones(1, 10, 4, 6)), - (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), - (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), - ] - - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.minimum(x, y) - - def _test_minimum_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.minimum.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_minimum_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.minimum.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_minimum_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .to_edge() - .partition() - .to_executorch() - .serialize() - ) - - return tester - - @parameterized.expand(Minimum.test_parameters) - def test_minimum_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_minimum_tosa_MI_pipeline(self.Minimum(), test_data) - - @parameterized.expand(Minimum.test_parameters) - def test_minimum_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_minimum_tosa_BI_pipeline(self.Minimum(), test_data) - - @parameterized.expand(Minimum.test_parameters) - def test_minimum_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - tester = self._test_minimum_ethos_BI_pipeline( - self.Minimum(), common.get_u55_compile_spec(), test_data - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(Minimum.test_parameters) - def test_minimum_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) 
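# The quantized (BI) comparisons in these suites pass qtol=1, i.e. the
# delegate output may differ from the fp32 reference by one quantization
# step. Assuming a per-tensor symmetric int8 scheme (an assumption made
# here only to size that step), the slack for the large-magnitude case is:

import torch

x = 10000 * torch.randn(1, 1, 4, 4)  # "rand_large"-style input from this suite
step = x.abs().max().item() / 127    # one int8 level, symmetric quantization
print(f"qtol=1 tolerates an absolute error of about {step:.2f}")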
- tester = self._test_minimum_ethos_BI_pipeline( - self.Minimum(), common.get_u85_compile_spec(), test_data - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs( - qtol=1, - inputs=test_data, - ) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +test_t = tuple[torch.Tensor, torch.Tensor] +aten_op = "torch.ops.aten.minimum.default" + + +class Minimum(torch.nn.Module): + test_parameters = { + "float_tensor": lambda: ( + torch.FloatTensor([1, 2, 3, 5, 7]), + (torch.FloatTensor([2, 1, 2, 1, 10])), + ), + "ones": lambda: (torch.ones(1, 10, 4, 6), 2 * torch.ones(1, 10, 4, 6)), + "rand_diff": lambda: (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), + "rand_same": lambda: (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), + "rand_large": lambda: ( + 10000 * torch.randn(1, 1, 4, 4), + torch.randn(1, 1, 4, 1), + ), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.minimum(x, y) + + +@common.parametrize("test_data", Minimum.test_parameters) +def test_minimum_tosa_MI(test_data: Tuple): + TosaPipelineMI[test_t](Minimum(), test_data(), aten_op).run() + + +@common.parametrize("test_data", Minimum.test_parameters) +def test_minimum_tosa_BI(test_data: Tuple): + TosaPipelineBI[test_t](Minimum(), test_data(), aten_op).run() + + +@common.parametrize("test_data", Minimum.test_parameters) +@common.XfailIfNoCorstone300 +def test_minimum_u55_BI(test_data: Tuple): + EthosU55PipelineBI[test_t]( + Minimum(), + test_data(), + aten_op, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", Minimum.test_parameters) +@common.XfailIfNoCorstone320 +def test_minimum_u85_BI(test_data: Tuple): + EthosU85PipelineBI[test_t]( + Minimum(), + test_data(), + aten_op, + run_on_fvp=True, + ).run() diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index a4503280db9..a5a3b4b98b9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
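# torch.mm is strictly a 2-D matrix multiply, which is why every generator
# in the suite below yields a pair of rank-2 tensors with matching inner
# dimensions; batched or broadcast inputs would need torch.matmul instead:

import torch

a, b = torch.rand(3, 5), torch.rand(5, 2)
assert torch.mm(a, b).shape == (3, 2)  # (3, 5) x (5, 2) -> (3, 2)
# torch.mm(torch.rand(1, 3, 5), b) would raise a RuntimeError (not a matrix)
assert torch.matmul(torch.rand(4, 3, 5), torch.rand(5, 2)).shape == (4, 3, 2)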
-from typing import Callable +from typing import Tuple import pytest import torch @@ -15,19 +15,18 @@ TosaPipelineBI, TosaPipelineMI, ) -from parameterized import parameterized test_t = tuple[torch.Tensor, torch.Tensor] class MM(torch.nn.Module): - test_data_generators = [ - lambda: (torch.rand(3, 5), torch.rand(5, 2)), - lambda: (torch.rand(1, 1), torch.rand(1, 1)), - lambda: (torch.ones(55, 3), torch.ones(3, 44)), - lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)), - lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)), - ] + test_data_generators = { + "rand_2d": lambda: (torch.rand(3, 5), torch.rand(5, 2)), + "rand_same": lambda: (torch.rand(1, 1), torch.rand(1, 1)), + "ones": lambda: (torch.ones(55, 3), torch.ones(3, 44)), + "randn_large": lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)), + "rand_neg": lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)), + } aten_op = "torch.ops.aten.mm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_mm_default" @@ -35,43 +34,35 @@ def forward(self, x, y): return torch.mm(x, y) -@parameterized.expand(MM.test_data_generators) -def test_mm_tosa_MI(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() - TosaPipelineMI[test_t](MM(), test_data, MM.aten_op).run() +@common.parametrize("test_data", MM.test_data_generators) +def test_mm_tosa_MI(test_data: Tuple): + TosaPipelineMI[test_t](MM(), test_data(), MM.aten_op).run() -@parameterized.expand(MM.test_data_generators) -def test_mm_tosa_BI(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() - TosaPipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() +@common.parametrize("test_data", MM.test_data_generators) +def test_mm_tosa_BI(test_data: Tuple): + TosaPipelineBI[test_t](MM(), test_data(), MM.aten_op, MM.exir_op).run() -@parameterized.expand(MM.test_data_generators) -def test_mm_tosa_u55(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() - EthosU55PipelineBI[test_t](MM(), test_data, MM.aten_op).run() - - -@parameterized.expand(MM.test_data_generators) +@common.parametrize("test_data", MM.test_data_generators) +@common.XfailIfNoCorstone300 @pytest.mark.flaky # Investigate flakiness (MLETORCH-870) -def test_mm_tosa_u85(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() - EthosU85PipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() - - -@parameterized.expand(MM.test_data_generators) -@common.SkipIfNoCorstone300 -def test_mm_tosa_u55_on_fvp(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() - EthosU55PipelineBI[test_t](MM(), test_data, MM.aten_op, run_on_fvp=True).run() +def test_mm_u55_BI(test_data: Tuple): + EthosU55PipelineBI[test_t]( + MM(), + test_data(), + MM.aten_op, + run_on_fvp=True, + ).run() -@parameterized.expand(MM.test_data_generators) -@common.SkipIfNoCorstone320 -@pytest.mark.flaky # Investigate flakiness (MLETORCH-870) -def test_mm_tosa_u85_on_fvp(test_data_generator: Callable[[], tuple]): - test_data = test_data_generator() +@common.parametrize("test_data", MM.test_data_generators) +@common.XfailIfNoCorstone320 +def test_mm_u85_BI(test_data: Tuple): EthosU85PipelineBI[test_t]( - MM(), test_data, MM.aten_op, MM.exir_op, run_on_fvp=True + MM(), + test_data(), + MM.aten_op, + MM.exir_op, + run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 739864a4982..f960f348a87 100644 --- a/backends/arm/test/ops/test_mul.py +++ 
b/backends/arm/test/ops/test_mul.py @@ -1,226 +1,155 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest -import pytest +from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized -test_data_suite = [ +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor, torch.Tensor] # Input x +aten_op = "torch.ops.aten.mul.Tensor" + +test_data_suite = { # (test_name, input, other,) See torch.mul() for info - ( - "op_mul_rank1_rand", + "op_mul_rank1_rand": lambda: ( torch.rand(5) * 3.7, torch.rand(5) * 1.5, ), - ( - "op_mul_rank2_rand", + "op_mul_rank2_rand": lambda: ( torch.rand(4, 5), torch.rand(1, 5), ), - ( - "op_mul_rank3_randn", + "op_mul_rank3_randn": lambda: ( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ), - ( - "op_mul_rank4_randn", + "op_mul_rank4_randn": lambda: ( torch.randn(1, 10, 25, 20), torch.randn(1, 10, 25, 20), ), - ( - "op_mul_rank4_ones_mul_negative", + "op_mul_rank4_ones_mul_negative": lambda: ( torch.ones(1, 10, 25, 20), (-1) * torch.ones(1, 10, 25, 20), ), - ( - "op_mul_rank4_negative_large_rand", + "op_mul_rank4_negative_large_rand": lambda: ( (-200) * torch.rand(1, 10, 25, 20), torch.rand(1, 1, 1, 20), ), - ( - "op_mul_rank4_large_randn", + "op_mul_rank4_large_randn": lambda: ( 200 * torch.randn(1, 10, 25, 20), torch.rand(1, 10, 25, 1), ), -] +} -test_data_suite_2 = [ +test_data_suite_2 = { # (test_name, input, other,) See torch.mul() for info - ( - "op_mul_rank2_rand", + "op_mul_rank2_rand": lambda: ( torch.rand(4, 5), torch.rand(5), ), - ( - "op_mul_rank3_randn", + "op_mul_rank3_randn": lambda: ( torch.randn(10, 5, 2), torch.randn(5, 2), ), - ( - "op_mul_rank4_randn", + "op_mul_rank4_randn": lambda: ( torch.randn(1, 10, 25, 20), torch.randn(1, 25, 20), ), - ( - "op_mul_rank4_randn_2", + "op_mul_rank4_randn_2": lambda: ( torch.randn(1, 25, 1), torch.randn(1, 3, 25, 10), ), -] - - -class TestMul(unittest.TestCase): - class Mul(torch.nn.Module): - - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return input_ * other_ - - def _test_mul_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .check_count({"torch.ops.aten.mul.Tensor": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_mul_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.mul.Tensor": 1}) - 
.check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) - ) - - def _test_mul_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.mul.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_mul_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_mul_tosa_MI_pipeline(self.Mul(), test_data) - - @parameterized.expand(test_data_suite_2) - def test_mul_diff_input_ranks_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_mul_tosa_MI_pipeline(self.Mul(), test_data) +} - @parameterized.expand(test_data_suite_2) - def test_mul_diff_input_ranks_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - @parameterized.expand(test_data_suite) - def test_mul_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - - test_data = (input_, other_) - self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) +class Mul(torch.nn.Module): - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_mul_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_mul_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Mul(), test_data - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_mul_u85_BI( + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_mul_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Mul(), test_data - ) + return input_ * other_ + + +@common.parametrize("test_data", test_data_suite) +def test_mul_tensor_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_2) +def test_mul_tensor_tosa_MI_diff_input_ranks(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_2) +def test_mul_tensor_tosa_BI_diff_input_ranks(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_mul_tensor_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_mul_tensor_u55_BI(test_data: torch.Tensor): + pipeline = 
EthosU55PipelineBI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_mul_tensor_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Mul(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_ne.py b/backends/arm/test/ops/test_ne.py index 7ab75827cad..2ceacdb31b9 100644 --- a/backends/arm/test/ops/test_ne.py +++ b/backends/arm/test/ops/test_ne.py @@ -126,11 +126,12 @@ def test_ne_tensor_u55_BI(test_module): pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", { NotEqual.decomposed_exir_ops[0]: 1, NotEqual.decomposed_exir_ops[1]: 1, }, + quantize=True, + u55_subset=True, ) pipeline.run() @@ -143,11 +144,12 @@ def test_ne_scalar_u55_BI(test_module): pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", { NotEqual.decomposed_exir_ops[0]: 1, NotEqual.decomposed_exir_ops[1]: 1, }, + quantize=True, + u55_subset=True, n_expected_delegates=1, ) pipeline.run() diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 50db1231b41..3bbfdb69903 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -5,185 +5,105 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, ) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.test.tester.test_pipeline import OpNotSupportedPipeline -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized from torchvision.ops import Permute -test_data_suite = [ +input_t1 = Tuple[torch.Tensor] # Input x + +aten_op = "torch.ops.aten.permute.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_permute_default" + +test_data_suite = { # (test_name,test_data,dims) - ("rank_2", torch.rand(10, 10), [1, 0]), - ("rank_3", torch.rand(10, 10, 10), [2, 0, 1]), - ("rank_3", torch.rand(10, 10, 10), [1, 2, 0]), - ("rank_4", torch.rand(1, 5, 1, 10), [0, 2, 3, 1]), - ("rank_4", torch.rand(1, 2, 5, 10), [1, 0, 2, 3]), - ("rank_4", torch.rand(1, 10, 10, 5), [2, 0, 1, 3]), -] - - -class TestPermute(unittest.TestCase): - """Tests Permute Operator.""" - - class Permute(torch.nn.Module): - - def __init__(self, dims: list[int]): - super().__init__() - - self.permute = Permute(dims=dims) - - def forward(self, x): - return self.permute(x) - - def _test_permute_tosa_MI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - 
.check(["torch.ops.aten.permute.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_permute_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.permute.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_permute_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.permute.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_permute_tosa_MI( - self, test_name: str, test_data: torch.Tensor, dims: list[int] - ): - self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) - self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_permute_tosa_BI( - self, test_name: str, test_data: torch.Tensor, dims: list[int] - ): - self._test_permute_tosa_BI_pipeline(self.Permute(dims=dims), (test_data,)) - - # Expected to fail as TOSA.Transpose is not supported by Ethos-U55. - @parameterized.expand(test_data_suite[0:1]) - @pytest.mark.corstone_fvp - def test_permute_u55_BI( - self, test_name: str, test_data: torch.Tensor, dims: list[int] - ): - self._test_permute_ethos_BI_pipeline( - self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,) - ) - - @parameterized.expand(test_data_suite[:-2]) - @pytest.mark.corstone_fvp - def test_permute_u85_BI( - self, test_name: str, test_data: torch.Tensor, dims: list[int] - ): - self._test_permute_ethos_BI_pipeline( - self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) - ) - - # Fails since on FVP since N > 1 is not supported. 
MLETORCH-517 - @parameterized.expand(test_data_suite[-2:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_permute_u85_BI_xfails( - self, test_name: str, test_data: torch.Tensor, dims: list[int] - ): - self._test_permute_ethos_BI_pipeline( - self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) - ) - - -reject_data_suite = { - "int8_r3_axes_product": ([1, 700, 1000], [2, 1, 0], torch.int8), - "int8_r5_axes_product": ([1, 1, 1, 700, 1000], [0, 1, 2, 3, 4], torch.int8), - "int8_r4_NH_too_large": ([700, 100, 1, 1], [0, 1, 3, 2], torch.int8), - "int32_r5_no_support": ([2, 2, 2, 2, 2], [3, 4, 2, 1, 0], torch.int32), + "rank_2": lambda: (torch.rand(10, 10), [1, 0]), + "rank_3": lambda: (torch.rand(10, 10, 10), [2, 0, 1]), + "rank_3_2": lambda: (torch.rand(10, 10, 10), [1, 2, 0]), + "rank_4": lambda: (torch.rand(1, 5, 1, 10), [0, 2, 3, 1]), + "rank_4_2": lambda: (torch.rand(1, 2, 5, 10), [1, 0, 2, 3]), + "rank_4_3": lambda: (torch.rand(1, 10, 10, 5), [2, 0, 1, 3]), +} + + +class SimplePermute(torch.nn.Module): + + def __init__(self, dims: list[int]): + super().__init__() + + self.permute = Permute(dims=dims) + + def forward(self, x): + return self.permute(x) + + +@common.parametrize("test_data", test_data_suite) +def test_permute_tosa_MI(test_data: torch.Tensor): + test_data, dims = test_data() + pipeline = TosaPipelineMI[input_t1]( + SimplePermute(dims=dims), + (test_data,), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_permute_tosa_BI(test_data: torch.Tensor): + test_data, dims = test_data() + pipeline = TosaPipelineBI[input_t1]( + SimplePermute(dims=dims), + (test_data,), + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.run() + + +x_fails = { + "rank_4_2": "AssertionError: Output 0 does not match reference output.", + "rank_4_3": "AssertionError: Output 0 does not match reference output.", } -input_t = tuple[torch.Tensor] - - -@common.parametrize("test_data", reject_data_suite) -def test_permute_u55_BI_not_delegated(test_data): - # Tests that we don't delegate these ops since they are not supported on U55. - shape, permutation, dtype = test_data - data = ((torch.rand(shape) * 10).to(dtype),) - pipeline = OpNotSupportedPipeline[input_t]( - TestPermute.Permute(dims=permutation), - data, - "TOSA-0.80+BI+u55", - {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1}, + + +@common.parametrize("test_data", test_data_suite, x_fails) +@common.XfailIfNoCorstone300 +def test_permute_u55_BI(test_data): + test_data, dims = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + SimplePermute(dims=dims), + (test_data,), + aten_op, + exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +# Fails since on FVP since N > 1 is not supported. 
MLETORCH-517
+@common.parametrize("test_data", test_data_suite, x_fails)
+@common.XfailIfNoCorstone320
+def test_permute_u85_BI(test_data: torch.Tensor):
+    test_data, dims = test_data()
+    pipeline = EthosU85PipelineBI[input_t1](
+        SimplePermute(dims=dims),
+        (test_data,),
+        aten_op,
+        exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default",
+        run_on_fvp=True,
+        symmetric_io_quantization=True,
     )
     pipeline.run()
diff --git a/backends/arm/test/ops/test_pow.py b/backends/arm/test/ops/test_pow.py
index 618acf50fc2..98b23870f21 100644
--- a/backends/arm/test/ops/test_pow.py
+++ b/backends/arm/test/ops/test_pow.py
@@ -81,8 +81,14 @@ def forward(self, x: torch.Tensor):
         return torch.pow(x, self.exp)
 
 
-@common.parametrize("test_data", Pow_TensorTensor.test_data)
-def test_pow_tensor_tensor_MI(test_data: Pow_TensorTensor.input_t):
+x_fail = {
+    "zero_base_zero_exp": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "neg_base_zero_exp": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+}
+
+
+@common.parametrize("test_data", Pow_TensorTensor.test_data, x_fail, strict=False)
+def test_pow_tensor_tensor_tosa_MI(test_data: Pow_TensorTensor.input_t):
     pipeline = TosaPipelineMI[Pow_TensorTensor.input_t](
         Pow_TensorTensor(),
         test_data(),
@@ -92,8 +98,18 @@ def test_pow_tensor_tensor_MI(test_data: Pow_TensorTensor.input_t):
     pipeline.run()
 
 
-@common.parametrize("test_data", Pow_TensorScalar.test_data)
-def test_pow_tensor_scalar_MI(test_data: Pow_TensorScalar.input_t):
+x_fail = {
+    "exp_minus_three": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "exp_minus_one": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "exp_zero": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "exp_one": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "exp_two": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+    "non_neg_base_exp_pos_decimal": "TOSA constraints: If x == 0 and y <= 0, the result is undefined.",
+}
+
+
+@common.parametrize("test_data", Pow_TensorScalar.test_data, x_fail, strict=False)
+def test_pow_tensor_scalar_tosa_MI(test_data: Pow_TensorScalar.input_t):
     base, exp = test_data()
     pipeline = TosaPipelineMI[Pow_TensorScalar.input_t](
         Pow_TensorScalar(exp),
@@ -104,8 +120,8 @@ def test_pow_tensor_scalar_MI(test_data: Pow_TensorScalar.input_t):
     pipeline.run()
 
 
-@common.parametrize("test_data", Pow_TensorScalar.test_data)
-def test_pow_tensor_scalar_BI(test_data: Pow_TensorScalar.input_t):
+@common.parametrize("test_data", Pow_TensorScalar.test_data, x_fail, strict=False)
+def test_pow_tensor_scalar_tosa_BI(test_data: Pow_TensorScalar.input_t):
     base, exp = test_data()
     pipeline = TosaPipelineBI[Pow_TensorScalar.input_t](
         Pow_TensorScalar(exp),
diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py
index b3233d02a92..92a33346015 100644
--- a/backends/arm/test/ops/test_reciprocal.py
+++ b/backends/arm/test/ops/test_reciprocal.py
@@ -1,120 +1,91 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
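# Why the x_fail entries above are expected to fail: TOSA leaves pow
# undefined when the base is 0 and the exponent is <= 0, so a comparison
# against the floating-point reference is meaningless there. What the fp32
# reference itself produces in those corner cases:

import torch

print(torch.pow(torch.tensor(0.0), -1.0))  # tensor(inf): 1/0 diverges
print(torch.pow(torch.tensor(0.0), 0.0))   # tensor(1.): a convention, not a limit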
-import unittest -import pytest +from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized - -test_data_t = tuple[str, torch.Tensor] -test_data_suite: list[test_data_t] = [ - ( - "op_reciprocal_rank1_ones", - torch.ones(5), - ), - ( - "op_reciprocal_rank1_rand", - torch.rand(5) * 5, - ), - ("op_reciprocal_rank1_negative_ones", torch.ones(5) * (-1)), - ("op_reciprocal_rank4_ones", torch.ones(1, 10, 25, 20)), - ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(1, 10, 25, 20)), - ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(1, 10, 25, 20)), - ("op_reciprocal_rank4_large_rand", 200 * torch.rand(1, 10, 25, 20)), - ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(1, 10, 25, 20)), - ("op_reciprocal_rank4_large_randn", 200 * torch.randn(1, 10, 25, 20) + 1), -] - - -class TestReciprocal(unittest.TestCase): - """Tests reciprocal""" - - class Reciprocal(torch.nn.Module): - - def forward(self, input_: torch.Tensor): - return input_.reciprocal() - - def _test_reciprocal_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.reciprocal.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_reciprocal_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.reciprocal.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_reciprocal_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor] - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.reciprocal.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_reciprocal_tosa_MI(self, test_name: str, input_: torch.Tensor): - test_data = (input_,) - self._test_reciprocal_tosa_MI_pipeline(self.Reciprocal(), test_data) - - @parameterized.expand(test_data_suite) - def test_reciprocal_tosa_BI(self, test_name: str, input_: torch.Tensor): - - test_data = (input_,) - self._test_reciprocal_tosa_BI_pipeline(self.Reciprocal(), test_data) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_reciprocal_u55_BI(self, test_name: str, input_: torch.Tensor): - test_data = (input_,) - self._test_reciprocal_u55_BI_pipeline(self.Reciprocal(), test_data) + +from executorch.backends.arm.test import common + +from 
executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] # Input x, Input y +aten_op = "torch.ops.aten.reciprocal.default" + +test_data_suite = { + "op_reciprocal_rank1_ones": lambda: torch.ones(5), + "op_reciprocal_rank1_rand": lambda: torch.rand(5) * 5, + "op_reciprocal_rank1_negative_ones": lambda: torch.ones(5) * (-1), + "op_reciprocal_rank4_ones": lambda: torch.ones(1, 10, 25, 20), + "op_reciprocal_rank4_negative_ones": lambda: (-1) * torch.ones(1, 10, 25, 20), + "op_reciprocal_rank4_ones_reciprocal_negative": lambda: torch.ones(1, 10, 25, 20), + "op_reciprocal_rank4_large_rand": lambda: 200 * torch.rand(1, 10, 25, 20), + "op_reciprocal_rank4_negative_large_rand": lambda: (-200) + * torch.rand(1, 10, 25, 20), + "op_reciprocal_rank4_large_randn": lambda: 200 * torch.randn(1, 10, 25, 20) + 1, +} + + +class Reciprocal(torch.nn.Module): + + def forward(self, input_: torch.Tensor): + return input_.reciprocal() + + +@common.parametrize("test_data", test_data_suite) +def test_reciprocal_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1]( + Reciprocal(), + (test_data(),), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_reciprocal_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Reciprocal(), + (test_data(),), + aten_op, + exir_op=[], + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 +def test_reciprocal_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Reciprocal(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=False, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 +def test_reciprocal_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Reciprocal(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=False, + symmetric_io_quantization=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py index 3fc64c89be1..e27a65e76da 100644 --- a/backends/arm/test/ops/test_relu.py +++ b/backends/arm/test/ops/test_relu.py @@ -1,134 +1,91 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
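# symmetric_io_quantization=True, used by most BI pipelines in this patch,
# pins the zero point of the model's input/output quantization to 0. A
# sketch of the resulting per-tensor int8 parameters -- an assumed scheme
# for illustration, not the quantizer's actual implementation:

import torch


def symmetric_int8_params(x: torch.Tensor) -> tuple[float, int]:
    scale = x.abs().max().item() / 127.0
    return scale, 0  # zero_point is fixed at 0 by construction


x = 200 * torch.rand(1, 10, 25, 20)  # matches op_reciprocal_rank4_large_rand
scale, zero_point = symmetric_int8_params(x)
q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)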
-import unittest from typing import Tuple import torch -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, -) from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +aten_op = "torch.ops.aten.relu.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_relu_default" + +test_data_suite = { # (test_name, test_data) - ("zeros", torch.zeros(1, 10, 10, 10)), - ("ones", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), - ("randn_neg", torch.randn(10) - 10), - ("ramp", torch.arange(-16, 16, 0.2)), -] - - -class TestRelu(unittest.TestCase): - class Relu(torch.nn.Module): - def __init__(self): - super().__init__() - self.relu = torch.nn.ReLU() - - def forward(self, x): - return self.relu(x) - - def _test_relu_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.relu.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_relu_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.relu.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_relu_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.relu.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - @parameterized.expand(test_data_suite) - def test_relu_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - 
self._test_relu_tosa_MI_pipeline(self.Relu(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_relu_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_relu_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Relu(), (test_data,) - ) - - @parameterized.expand(test_data_suite) - def test_relu_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_relu_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Relu(), (test_data,) - ) + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), +} + + +class Relu(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +@common.parametrize("test_data", test_data_suite) +def test_relu_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1]( + Relu(), + (test_data(),), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_relu_tosa_BI(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Relu(), + (test_data(),), + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_relu_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Relu(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_relu_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Relu(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index da2770cfafe..3a7a37196ec 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
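# Tensor.repeat, exercised by the rewritten suite below, tiles the input
# along each dimension and implicitly prepends singleton dims when the
# multiples tuple is longer than the input's rank:

import torch

x = torch.randn(3)
assert x.repeat(2).shape == (6,)       # same rank, tiled
assert x.repeat(2, 2).shape == (2, 6)  # treated as (1, 3) -> (2, 6)
assert torch.randn(3, 3).repeat(2, 2, 2).shape == (2, 6, 6)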
@@ -8,115 +7,83 @@
 # Tests the repeat op which copies the data of the input tensor (possibly with new data format)
 #
-import unittest
+
 from typing import Sequence, Tuple

 import torch

-from executorch.backends.arm.quantizer import (
-    EthosUQuantizer,
-    get_symmetric_quantization_config,
-    TOSAQuantizer,
-)
 from executorch.backends.arm.test import common
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.backends.arm.tosa_specification import TosaSpecification
-
-from executorch.backends.xnnpack.test.tester.tester import Quantize
-from executorch.exir.backend.backend_details import CompileSpec
-from parameterized import parameterized
-
-
-class TestSimpleRepeat(unittest.TestCase):
-    """Tests Tensor.repeat for different ranks and dimensions."""
-
-    class Repeat(torch.nn.Module):
-        # (input tensor, multiples)
-        test_parameters = [
-            (torch.randn(3), (2,)),
-            (torch.randn(3, 4), (2, 1)),
-            (torch.randn(1, 1, 2, 2), (1, 2, 3, 4)),
-            (torch.randn(3), (2, 2)),
-            (torch.randn(3), (1, 2, 3)),
-            (torch.randn((3, 3)), (2, 2, 2)),
-            (torch.randn((3, 3, 3)), (2, 1, 2, 4)),
-        ]
-
-        def forward(self, x: torch.Tensor, multiples: Sequence):
-            return x.repeat(multiples)
-
-    def _test_repeat_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
-            )
-            .export()
-            .check_count({"torch.ops.aten.repeat.default": 1})
-            .to_edge()
-            .partition()
-            .check_not(["torch.ops.aten.repeat.default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
-        compile_spec = common.get_tosa_compile_spec(tosa_spec)
-        quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
-        (
-            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
-            .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
-            .export()
-            .check_count({"torch.ops.aten.repeat.default": 1})
-            .to_edge()
-            .partition()
-            .check_not(["torch.ops.aten.repeat.default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data, qtol=1)
-        )
-
-    def _test_repeat_ethosu_pipeline(
-        self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple
-    ):
-        quantizer = EthosUQuantizer(compile_spec).set_io(
-            get_symmetric_quantization_config()
-        )
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=compile_spec,
-            )
-            .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
-            .export()
-            .check_count({"torch.ops.aten.repeat.default": 1})
-            .to_edge()
-            .partition()
-            .check_not(["torch.ops.aten.repeat.default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-    @parameterized.expand(Repeat.test_parameters)
-    def test_repeat_tosa_MI(self, test_input, multiples):
-        self._test_repeat_tosa_MI_pipeline(self.Repeat(), (test_input, multiples))
-
-    @parameterized.expand(Repeat.test_parameters)
-    def test_repeat_tosa_BI(self, test_input, multiples):
-        self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples))
-
-    @parameterized.expand(Repeat.test_parameters)
-    def test_repeat_u55_BI(self, test_input, multiples):
-        self._test_repeat_ethosu_pipeline(
-            common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples)
-        )
-
-    @parameterized.expand(Repeat.test_parameters)
-    def test_repeat_u85_BI(self, test_input, multiples):
-        self._test_repeat_ethosu_pipeline(
-            common.get_u85_compile_spec(), self.Repeat(), (test_input, multiples)
-        )
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+input_t1 = Tuple[torch.Tensor, torch.Tensor]  # Input x, Input y
+aten_op = "torch.ops.aten.repeat.default"
+
+
+"""Tests Tensor.repeat for different ranks and dimensions."""
+
+
+class Repeat(torch.nn.Module):
+    # (input tensor, multiples)
+    test_parameters = {
+        "1_x_1": lambda: (torch.randn(3), (2,)),
+        "2_x_2": lambda: (torch.randn(3, 4), (2, 1)),
+        "4_x_4": lambda: (torch.randn(1, 1, 2, 2), (1, 2, 3, 4)),
+        "1_x_2": lambda: (torch.randn(3), (2, 2)),
+        "1_x_3": lambda: (torch.randn(3), (1, 2, 3)),
+        "2_x_3": lambda: (torch.randn((3, 3)), (2, 2, 2)),
+        "1_x_4": lambda: (torch.randn((3, 3, 3)), (2, 1, 2, 4)),
+    }
+
+    def forward(self, x: torch.Tensor, multiples: Sequence):
+        return x.repeat(multiples)
+
+
+@common.parametrize("test_data", Repeat.test_parameters)
+def test_repeat_tosa_MI(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](
+        Repeat(),
+        test_data(),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Repeat.test_parameters)
+def test_repeat_tosa_BI(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](
+        Repeat(),
+        test_data(),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Repeat.test_parameters)
+def test_repeat_u55_BI(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        Repeat(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=False,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Repeat.test_parameters)
+def test_repeat_u85_BI(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        Repeat(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=False,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_rshift.py b/backends/arm/test/ops/test_rshift.py
index 52c05d48038..2e11cee5183 100644
--- a/backends/arm/test/ops/test_rshift.py
+++ b/backends/arm/test/ops/test_rshift.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 import torch
+from executorch.backends.arm.test import common
 from executorch.backends.arm.test.common import (
     XfailIfNoCorstone300,
     XfailIfNoCorstone320,
 )
@@ -14,7 +15,6 @@
     TosaPipelineBI,
     TosaPipelineMI,
 )
-from parameterized import parameterized

 scalar_input_t = tuple[torch.Tensor, int]

@@ -23,11 +23,20 @@ class RshiftScalar(torch.nn.Module):
     torch_op_MI = "torch.ops.aten.__rshift__.Scalar"
     torch_op_BI = "torch.ops.aten.bitwise_right_shift.Tensor"
     exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_right_shift_Tensor"
-    test_data = [
-        ((torch.randint(-100, 100, (1, 12, 3, 4), dtype=torch.int8), 1),),
-        ((torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int16), 5),),
-        ((torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int32), 2),),
-    ]
+    test_data = {
+        "randint_neg_100_int8": lambda: (
+            torch.randint(-100, 100, (1, 12, 3, 4), dtype=torch.int8),
+            1,
+        ),
+        "randint_neg_100_int16": lambda: (
+            torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int16),
+            5,
+        ),
+        "randint_neg_100_int32": lambda: (
+            torch.randint(-100, 100, (1, 5, 3, 4), dtype=torch.int32),
+            2,
+        ),
+    }

     def forward(self, x: torch.Tensor, shift: int):
         return x >> shift
@@ -39,53 +48,53 @@ def forward(self, x: torch.Tensor, shift: int):
 class RshiftTensor(torch.nn.Module):
     torch_op = "torch.ops.aten.bitwise_right_shift.Tensor"
     exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_right_shift_Tensor"
-    test_data = [
-        (
-            (
-                torch.randint(-128, 127, (3, 3), dtype=torch.int8),
-                torch.randint(0, 5, (3, 3), dtype=torch.int8),
-            ),
-        ),
-        (
-            (
-                torch.randint(-1024, 1024, (3, 3, 3), dtype=torch.int16),
-                torch.randint(0, 5, (3, 3, 3), dtype=torch.int16),
-            ),
-        ),
-        (
-            (
-                torch.randint(0, 127, (1, 2, 3, 3), dtype=torch.int32),
-                torch.randint(0, 5, (1, 2, 3, 3), dtype=torch.int32),
-            ),
-        ),
-    ]
+    test_data = {
+        "randint_neg_128_int8": lambda: (
+            torch.randint(-128, 127, (3, 3), dtype=torch.int8),
+            torch.randint(0, 5, (3, 3), dtype=torch.int8),
+        ),
+        "randint_neg_1024_int16": lambda: (
+            torch.randint(-1024, 1024, (3, 3, 3), dtype=torch.int16),
+            torch.randint(0, 5, (3, 3, 3), dtype=torch.int16),
+        ),
+        "randint_0_127_int32": lambda: (
+            torch.randint(0, 127, (1, 2, 3, 3), dtype=torch.int32),
+            torch.randint(0, 5, (1, 2, 3, 3), dtype=torch.int32),
+        ),
+    }

     def forward(self, x: torch.Tensor, shift: torch.Tensor):
         return x.bitwise_right_shift(shift)


-@parameterized.expand(RshiftScalar.test_data)
-def test_rshift_scalar_tosa_MI(test_data):
+@common.parametrize("test_data", RshiftScalar.test_data)
+def test_rshift_scalar_tosa_MI_scalar(test_data):
     TosaPipelineMI[scalar_input_t](
-        RshiftScalar(), test_data, RshiftScalar.torch_op_MI, RshiftScalar.exir_op
+        RshiftScalar(),
+        test_data(),
+        RshiftScalar.torch_op_MI,
+        RshiftScalar.exir_op,
     ).run()


-@parameterized.expand(RshiftScalar.test_data)
-def test_rshift_scalar_tosa_BI(test_data):
+@common.parametrize("test_data", RshiftScalar.test_data)
+def test_bitwise_right_shift_tensor_tosa_BI_scalar(test_data):
     pipeline = TosaPipelineBI[scalar_input_t](
-        RshiftScalar(), test_data, RshiftScalar.torch_op_BI, RshiftScalar.exir_op
+        RshiftScalar(),
+        test_data(),
+        RshiftScalar.torch_op_BI,
+        RshiftScalar.exir_op,
     )
     pipeline.pop_stage("check.quant_nodes")
     pipeline.run()


-@parameterized.expand(RshiftScalar.test_data)
+@common.parametrize("test_data", RshiftScalar.test_data)
 @XfailIfNoCorstone300
-def test_rshift_scalar_tosa_u55(test_data):
+def test_bitwise_right_shift_tensor_u55_BI_scalar(test_data):
     pipeline = EthosU55PipelineBI[scalar_input_t](
         RshiftScalar(),
-        test_data,
+        test_data(),
         RshiftScalar.torch_op_BI,
         RshiftScalar.exir_op,
         run_on_fvp=True,
@@ -93,16 +102,16 @@ def test_rshift_scalar_tosa_u55(test_data):
     pipeline.pop_stage("check.quant_nodes")

     # Forced rounding in U55 HW causes off-by-one errors.
-    pipeline.change_args("run_method_and_compare_outputs", inputs=test_data, atol=1)
+    pipeline.change_args("run_method_and_compare_outputs", inputs=test_data(), atol=1)
     pipeline.run()


-@parameterized.expand(RshiftScalar.test_data)
+@common.parametrize("test_data", RshiftScalar.test_data)
 @XfailIfNoCorstone320
-def test_rshift_scalar_tosa_u85(test_data):
+def test_bitwise_right_shift_tensor_u85_BI_scalar(test_data):
     pipeline = EthosU85PipelineBI[scalar_input_t](
         RshiftScalar(),
-        test_data,
+        test_data(),
         RshiftScalar.torch_op_BI,
         RshiftScalar.exir_op,
         run_on_fvp=True,
@@ -111,28 +120,34 @@ def test_rshift_scalar_tosa_u85(test_data):
     pipeline.run()


-@parameterized.expand(RshiftTensor.test_data)
-def test_rshift_tensor_tosa_MI(test_data):
+@common.parametrize("test_data", RshiftTensor.test_data)
+def test_rshift_scalar_tosa_MI(test_data):
     TosaPipelineMI[scalar_input_t](
-        RshiftTensor(), test_data, RshiftTensor.torch_op, RshiftTensor.exir_op
+        RshiftTensor(),
+        test_data(),
+        RshiftTensor.torch_op,
+        RshiftTensor.exir_op,
     ).run()


-@parameterized.expand(RshiftTensor.test_data)
-def test_rshift_tensor_tosa_BI(test_data):
+@common.parametrize("test_data", RshiftTensor.test_data)
+def test_bitwise_right_shift_tensor_tosa_BI(test_data):
     pipeline = TosaPipelineBI[scalar_input_t](
-        RshiftTensor(), test_data, RshiftTensor.torch_op, RshiftTensor.exir_op
+        RshiftTensor(),
+        test_data(),
+        RshiftTensor.torch_op,
+        RshiftTensor.exir_op,
     )
     pipeline.pop_stage("check.quant_nodes")
     pipeline.run()


-@parameterized.expand(RshiftTensor.test_data)
+@common.parametrize("test_data", RshiftTensor.test_data)
 @XfailIfNoCorstone300
-def test_rshift_tensor_tosa_u55(test_data):
+def test_bitwise_right_shift_tensor_u55_BI(test_data):
     pipeline = EthosU55PipelineBI[scalar_input_t](
         RshiftTensor(),
-        test_data,
+        test_data(),
         RshiftTensor.torch_op,
         RshiftTensor.exir_op,
         run_on_fvp=True,
@@ -140,16 +155,16 @@ def test_rshift_tensor_tosa_u55(test_data):
     pipeline.pop_stage("check.quant_nodes")

     # Forced rounding in U55 HW causes off-by-one errors.
-    pipeline.change_args("run_method_and_compare_outputs", inputs=test_data, atol=1)
+    pipeline.change_args("run_method_and_compare_outputs", inputs=test_data(), atol=1)
     pipeline.run()


-@parameterized.expand(RshiftTensor.test_data)
+@common.parametrize("test_data", RshiftTensor.test_data)
 @XfailIfNoCorstone320
-def test_rshift_tensor_tosa_u85(test_data):
+def test_bitwise_right_shift_tensor_u85_BI(test_data):
     pipeline = EthosU85PipelineBI[scalar_input_t](
         RshiftTensor(),
-        test_data,
+        test_data(),
         RshiftTensor.torch_op,
         RshiftTensor.exir_op,
         run_on_fvp=True,
diff --git a/backends/arm/test/ops/test_rsqrt.py b/backends/arm/test/ops/test_rsqrt.py
index 2bf5fc371c8..0a9e95d890e 100644
--- a/backends/arm/test/ops/test_rsqrt.py
+++ b/backends/arm/test/ops/test_rsqrt.py
@@ -1,5 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -7,101 +6,78 @@
 # Tests the rsqrt op.
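Note: two pipeline-tweaking idioms recur in the rshift conversion above. pop_stage drops a named stage (here the quantization-node check, since the shift ops are not quantized), and change_args rebinds the keyword arguments of a staged call before run() executes it. A condensed sketch of the idiom, assuming those stage semantics (names and arguments are taken from test_rshift.py above):

def _example_u55_rshift_run():
    # RshiftScalar, EthosU55PipelineBI and scalar_input_t are defined above.
    data = RshiftScalar.test_data["randint_neg_100_int8"]()  # call the factory
    pipeline = EthosU55PipelineBI[scalar_input_t](
        RshiftScalar(),
        data,
        RshiftScalar.torch_op_BI,
        RshiftScalar.exir_op,
        run_on_fvp=True,
    )
    pipeline.pop_stage("check.quant_nodes")  # shifts produce no quant nodes
    # U55 hardware forces rounding on right-shifts, so tolerate off-by-one.
    pipeline.change_args("run_method_and_compare_outputs", inputs=data, atol=1)
    pipeline.run()
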
 #
-import unittest
+from typing import Tuple

 import torch
+
 from executorch.backends.arm.test import common
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
-
-
-class TestRsqrt(unittest.TestCase):
-    class Rsqrt(torch.nn.Module):
-        test_parameters = [
-            (torch.ones(1, 10, 10, 10),),
-            (torch.rand(1, 10, 10, 10),),
-            (torch.rand(1, 5, 10, 20),),
-            (torch.rand(5, 10, 20),),
-        ]
-
-        def forward(self, x: torch.Tensor):
-            return x.rsqrt()
-
-    def _test_rsqrt_tosa_MI_pipeline(
-        self, module: torch.nn.Module, test_data: tuple[torch.Tensor]
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
-            )
-            .export()
-            .check_count({"torch.ops.aten.rsqrt.default": 1})
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_rsqrt_tosa_BI_pipeline(
-        self, module: torch.nn.Module, test_data: tuple[torch.Tensor]
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
-            .quantize()
-            .export()
-            .check_count({"torch.ops.aten.rsqrt.default": 1})
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_rsqrt_ethosu_BI_pipeline(
-        self,
-        compile_spec: CompileSpec,
-        module: torch.nn.Module,
-        test_data: tuple[torch.Tensor],
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=compile_spec,
-            )
-            .quantize()
-            .export()
-            .check_count({"torch.ops.aten.rsqrt.default": 1})
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-    @parameterized.expand(Rsqrt.test_parameters)
-    def test_rsqrt_tosa_MI(self, test_tensor: torch.Tensor):
-        self._test_rsqrt_tosa_MI_pipeline(self.Rsqrt(), (test_tensor,))
-
-    @parameterized.expand(Rsqrt.test_parameters)
-    def test_rsqrt_tosa_BI(self, test_tensor: torch.Tensor):
-        self._test_rsqrt_tosa_BI_pipeline(self.Rsqrt(), (test_tensor,))
-
-    @parameterized.expand(Rsqrt.test_parameters)
-    def test_rsqrt_u55_BI(self, test_tensor: torch.Tensor):
-        self._test_rsqrt_ethosu_BI_pipeline(
-            common.get_u55_compile_spec(), self.Rsqrt(), (test_tensor,)
-        )
-
-    @parameterized.expand(Rsqrt.test_parameters)
-    def test_rsqrt_u85_BI(self, test_tensor: torch.Tensor):
-        self._test_rsqrt_ethosu_BI_pipeline(
-            common.get_u85_compile_spec(), self.Rsqrt(), (test_tensor,)
-        )
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+aten_op = "torch.ops.aten.rsqrt.default"
+input_t1 = Tuple[torch.Tensor]  # Input x
+
+
+class Rsqrt(torch.nn.Module):
+    test_parameters = {
+        "ones_4d": lambda: (torch.ones(1, 10, 10, 10),),
+        "rand_4d_1": lambda: (torch.rand(1, 10, 10, 10),),
+        "rand_4d_2": lambda: (torch.rand(1, 5, 10, 20),),
+        "rand_3d": lambda: (torch.rand(5, 10, 20),),
+    }
+
+    def forward(self, x: torch.Tensor):
+        return x.rsqrt()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+def test_rsqrt_tosa_MI(test_tensor: torch.Tensor):
+    pipeline = TosaPipelineMI[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+def test_rsqrt_tosa_BI(test_tensor: torch.Tensor):
+    pipeline = TosaPipelineBI[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+@common.XfailIfNoCorstone300
+def test_rsqrt_u55_BI(test_tensor: torch.Tensor):
+    pipeline = EthosU55PipelineBI[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+@common.XfailIfNoCorstone320
+def test_rsqrt_u85_BI(test_tensor: torch.Tensor):
+    pipeline = EthosU85PipelineBI[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_ops=[],
+        run_on_fvp=True,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_scalar_tensor.py b/backends/arm/test/ops/test_scalar_tensor.py
index ad9d385c1d1..7b050f7787e 100644
--- a/backends/arm/test/ops/test_scalar_tensor.py
+++ b/backends/arm/test/ops/test_scalar_tensor.py
@@ -14,22 +14,22 @@
 )

 float_test_data_suite = {
-    "scalar_tensor_float_1": (3.7, torch.float32, torch.rand((1, 2, 3, 4))),
-    "scalar_tensor_float_2": (66, torch.float32, torch.rand((1, 2, 3))),
+    "scalar_tensor_float_1": lambda: (3.7, torch.float32, torch.rand((1, 2, 3, 4))),
+    "scalar_tensor_float_2": lambda: (66, torch.float32, torch.rand((1, 2, 3))),
 }

 int_test_data_suite = {
-    "scalar_tensor_int32": (
+    "scalar_tensor_int32": lambda: (
         33,
         torch.int32,
         torch.randint(0, 10, (1, 2), dtype=torch.int32),
     ),
-    "scalar_tensor_int8": (
+    "scalar_tensor_int8": lambda: (
         8,
         torch.int8,
         torch.rand(1, 2, 3),
     ),
-    "scalar_tensor_int16": (
+    "scalar_tensor_int16": lambda: (
         16 * 16 * 16,
         torch.int16,
         torch.rand((1,)).unsqueeze(0),  # Rank 0 inputs not supported
@@ -49,17 +49,29 @@ def forward(self, x: torch.Tensor):
         return torch.scalar_tensor(self.scalar, dtype=self.dtype) + x


-@common.parametrize("test_data", int_test_data_suite | float_test_data_suite)
+@common.parametrize(
+    "test_data",
+    int_test_data_suite | float_test_data_suite,
+)
 def test_scalar_tensor_tosa_MI(test_data):  # Note TOSA MI supports all types
-    scalar, dtype, data = test_data
-    TosaPipelineMI(ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op).run()
+    scalar, dtype, data = test_data()
+    TosaPipelineMI(
+        ScalarTensor(scalar, dtype),
+        tuple(data),
+        ScalarTensor.aten_op,
+    ).run()


-@common.parametrize("test_data", int_test_data_suite | float_test_data_suite)
+@common.parametrize(
+    "test_data",
+    int_test_data_suite | float_test_data_suite,
+)
 def test_scalar_tensor_tosa_BI(test_data):
-    scalar, dtype, data = test_data
+    scalar, dtype, data = test_data()
     pipeline: TosaPipelineBI = TosaPipelineBI(
-        ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op
+        ScalarTensor(scalar, dtype),
+        tuple(data),
+        ScalarTensor.aten_op,
     )
     pipeline.pop_stage("check.quant_nodes")
     pipeline.run()
@@ -67,8 +79,8 @@ def test_scalar_tensor_tosa_BI(test_data):

 @common.parametrize("test_data", float_test_data_suite)
 @common.XfailIfNoCorstone300
-def test_scalar_tensor_tosa_u55(test_data):
-    scalar, dtype, data = test_data
+def test_scalar_tensor_u55_BI(test_data):
+    scalar, dtype, data = test_data()
     EthosU55PipelineBI(
         ScalarTensor(scalar, dtype),
         tuple(data),
@@ -80,8 +92,8 @@ def test_scalar_tensor_tosa_u55(test_data):

 @common.parametrize("test_data", float_test_data_suite)
 @common.XfailIfNoCorstone320
-def test_scalar_tensor_tosa_u85(test_data):
-    scalar, dtype, data = test_data
+def test_scalar_tensor_u85_BI(test_data):
+    scalar, dtype, data = test_data()
     EthosU85PipelineBI(
         ScalarTensor(scalar, dtype),
         tuple(data),
diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py
index 97af070120b..a4748e93fdb 100644
--- a/backends/arm/test/ops/test_scalars.py
+++ b/backends/arm/test/ops/test_scalars.py
@@ -3,13 +3,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import unittest
 from typing import Tuple

-import common
 import torch

+from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
     TosaPipelineBI,
     TosaPipelineMI,
@@ -32,90 +31,103 @@
 input_t1 = Tuple[torch.Tensor, torch.scalar_tensor]  # Input x, Input y


-class TestScalars(unittest.TestCase):
-    """Tests various scalar cases"""
+"""Tests various scalar cases"""

-    class Add(torch.nn.Module):
-        def forward(self, x, y):
-            return x + y

-    class Sub(torch.nn.Module):
-        def forward(self, x, y):
-            return x - y
+class Add(torch.nn.Module):
+    def forward(self, x, y):
+        return x + y

-    class Div(torch.nn.Module):
-        def forward(self, x, y):
-            return x / y

-    class Mul(torch.nn.Module):
-        def forward(self, x, y):
-            return x * y
+class Sub(torch.nn.Module):
+    def forward(self, x, y):
+        return x - y

-    class MulScalar(torch.nn.Module):
-        def forward(self, x, y):
-            return torch.ops.aten.mul.Scalar(x, y)

-    class DivScalar(torch.nn.Module):
-        def forward(self, x, y):
-            return torch.ops.aten.div.Scalar(x, y)
+class Div(torch.nn.Module):
+    def forward(self, x, y):
+        return x / y

-    class AddScalar(torch.nn.Module):
-        def forward(self, x, y):
-            return torch.ops.aten.add.Scalar(x, y)

-    class SubScalar(torch.nn.Module):
-        def forward(self, x, y):
-            return torch.ops.aten.sub.Scalar(x, y)
+class Mul(torch.nn.Module):
+    def forward(self, x, y):
+        return x * y

-    class AddInplace(torch.nn.Module):
-        def forward(self, x, y):
-            x += y
-            return x

-    class SubInplace(torch.nn.Module):
-        def forward(self, x, y):
-            x -= y
-            return x
+class MulScalar(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.mul.Scalar(x, y)

-    class DivInplace(torch.nn.Module):
-        def forward(self, x, y):
-            x /= y
-            return x

-    class MulInplace(torch.nn.Module):
-        def forward(self, x, y):
-            x *= y
-            return x
+class DivScalar(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.div.Scalar(x, y)

-    class AddConst(torch.nn.Module):
-        def forward(self, x):
-            x = 1.0 + x
-            return x

-    class ShiftInplaceSub(torch.nn.Module):
-        def forward(self, x):
-            x = x >> 4
-            x -= 10
-            return x
+class AddScalar(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.add.Scalar(x, y)
+
+
+class SubScalar(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.ops.aten.sub.Scalar(x, y)
+
+
+class AddInplace(torch.nn.Module):
+    def forward(self, x, y):
+        x += y
+        return x
+
+
+class SubInplace(torch.nn.Module):
+    def forward(self, x, y):
+        x -= y
+        return x
+
+
+class DivInplace(torch.nn.Module):
+    def forward(self, x, y):
+        x /= y
+        return x
+
+
+class MulInplace(torch.nn.Module):
+    def forward(self, x, y):
+        x *= y
+        return x
+
+
+class AddConst(torch.nn.Module):
+    def forward(self, x):
+        x = 1.0 + x
+        return x
+
+
+class ShiftInplaceSub(torch.nn.Module):
+    def forward(self, x):
+        x = x >> 4
+        x -= 10
+        return x


 # Inplace ops end with '_' (from aten naming)
 ops = [
-    ("Add", TestScalars.Add()),
-    ("Sub", TestScalars.Sub()),
-    ("Mul", TestScalars.Mul()),
-    ("Div", TestScalars.Div()),
-    ("Add_", TestScalars.AddInplace()),
-    ("Sub_", TestScalars.SubInplace()),
-    ("Mul_", TestScalars.MulInplace()),
-    ("Div_", TestScalars.DivInplace()),
-    ("MulScalar", TestScalars.MulScalar()),
-    ("DivScalar", TestScalars.DivScalar()),
-    ("AddScalar", TestScalars.AddScalar()),
-    ("SubScalar", TestScalars.SubScalar()),
+    ("Add", Add()),
+    ("Sub", Sub()),
+    ("Mul", Mul()),
+    ("Div", Div()),
+    ("Add_", AddInplace()),
+    ("Sub_", SubInplace()),
+    ("Mul_", MulInplace()),
+    ("Div_", DivInplace()),
+    ("MulScalar", MulScalar()),
+    ("DivScalar", DivScalar()),
+    ("AddScalar", AddScalar()),
+    ("SubScalar", SubScalar()),
 ]

-const_ops = [("Add", TestScalars.AddConst())]
+const_ops = [("Add", AddConst())]

 dtypes = [("int", 3), ("float", 3.0)]
 sizes = [("r1", (1)), ("r4", (2, 4, 5, 3))]
@@ -198,16 +210,18 @@ def _test_add_tosa_BI_pipeline(
 }


-@common.parametrize("tensor_scalar_tests", tensor_scalar_tests, MI_xfails)
-def test_MI(tensor_scalar_tests: list):
+@common.parametrize(
+    "tensor_scalar_tests",
+    tensor_scalar_tests,
+    MI_xfails,
+)
+def test_tosa_MI(tensor_scalar_tests: list):
     op, x, y = tensor_scalar_tests
     _test_add_tosa_MI_pipeline(op, (x, y))


 def _test_passes_tosa_BI_pipeline(module: torch.nn.Module, test_data: tuple):
-    pipeline = TransformAnnotationPassPipeline[input_t1](
-        module, test_data, tosa_version="TOSA-0.80+BI"
-    )
+    pipeline = TransformAnnotationPassPipeline[input_t1](module, test_data)
     pipeline.run()


@@ -221,39 +235,47 @@ def _test_passes_tosa_BI_pipeline(module: torch.nn.Module, test_data: tuple):

 @common.parametrize(
-    "tensor_scalar_tests", tensor_scalar_tests, passes_xfails, strict=False
+    "tensor_scalar_tests",
+    tensor_scalar_tests,
+    passes_xfails,
+    strict=False,
 )
-def test_passes_BI(tensor_scalar_tests: list):
+def test_scalars_tosa_BI_passes(tensor_scalar_tests: list):
     op, x, y = tensor_scalar_tests
     _test_passes_tosa_BI_pipeline(op, (x, y))


 # op(Scalar float, tensor) works if the scalar is constant.
 @common.parametrize("tensor_const_tests", tensor_const_tests)
-def test_MI_const(tensor_const_tests: list):
+def test_scalars_tosa_MI(tensor_const_tests: list):
     op, x = tensor_const_tests
     _test_add_tosa_MI_pipeline(op, (x,))


 @common.parametrize("tensor_scalar_tests", tensor_scalar_tests)
-def test_BI(tensor_scalar_tests: list):
+def test_scalars_tosa_BI(tensor_scalar_tests: list):
     op, x, y = tensor_scalar_tests
     _test_add_tosa_BI_pipeline(op, (x, y))


 # op(Scalar float, tensor) works if the scalar is constant.
 @common.parametrize("tensor_const_tests", tensor_const_tests)
-def test_BI_const(tensor_const_tests: list):
+def test_scalars_tosa_BI_const(tensor_const_tests: list):
     op, x = tensor_const_tests
     _test_add_tosa_BI_pipeline(op, (x,))


 def test_shift_sub_inplace_tosa_MI():
-    _test_add_tosa_MI_pipeline(TestScalars.ShiftInplaceSub(), (torch.IntTensor(5),))
+    _test_add_tosa_MI_pipeline(
+        ShiftInplaceSub(),
+        (torch.IntTensor(5),),
+    )


 # Do not check for quant nodes in the graph for rshift.
 def test_shift_sub_inplace_tosa_BI():
     _test_add_tosa_BI_pipeline(
-        TestScalars.ShiftInplaceSub(), (torch.IntTensor(5),), check_quant_nodes=False
+        ShiftInplaceSub(),
+        (torch.IntTensor(5),),
+        check_quant_nodes=False,
     )
diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py
index fbeb4ebf9e7..a0b72942d44 100644
--- a/backends/arm/test/ops/test_select.py
+++ b/backends/arm/test/ops/test_select.py
@@ -1,182 +1,157 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import unittest
+
+from typing import Tuple

 import torch
 from executorch.backends.arm.test import common
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)

-test_data_t = tuple[torch.Tensor, int, int]
+input_t1 = Tuple[torch.Tensor, int, int]

-test_data_suite: list[tuple[test_data_t]] = [
+test_data_suite = {
     # (test_data, dim, index)
-    ((torch.zeros(5, 3, 20), -1, 0),),
-    ((torch.rand(5, 3, 20), 0, -1),),
-    ((torch.zeros(5, 3, 20), 0, 4),),
-    ((torch.ones(10, 10, 10), 0, 2),),
-    ((torch.rand(5, 3, 20, 2), 0, 2),),
-    ((torch.rand(10, 10) - 0.5, 0, 0),),
-    ((torch.randn(10) + 10, 0, 1),),
-    ((torch.randn(10) - 10, 0, 2),),
-    ((torch.arange(-16, 16, 0.2), 0, 1),),
-]
-
-
-class TestSelect(unittest.TestCase):
-    class SelectCopy(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-
-        def forward(self, x, dim: int, index: int):
-            return torch.select_copy(x, dim=dim, index=index)
-
-    class SelectInt(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-
-        def forward(self, x, dim: int, index: int):
-            return torch.select(x, dim=dim, index=index)
-
-    def _test_select_tosa_MI_pipeline(
-        self,
-        module: torch.nn.Module,
-        test_data: test_data_t,
-        export_target: str,
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
-            )
-            .export()
-            .check([export_target])
-            .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_select_tosa_BI_pipeline(
-        self,
-        module: torch.nn.Module,
-        test_data: test_data_t,
-        export_target: str,
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
-            .quantize()
-            .export()
-            .check([export_target])
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
-        )
-
-    def _test_select_ethos_BI_pipeline(
-        self,
-        compile_spec: list[CompileSpec],
-        module: torch.nn.Module,
-        test_data: test_data_t,
-        export_target: str,
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=compile_spec,
-            )
-            .quantize()
-            .export()
-            .check([export_target])
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-    def _test_select_tosa_u55_BI_pipeline(
-        self, module: torch.nn.Module, test_data: test_data_t, export_target: str
-    ):
-        self._test_select_ethos_BI_pipeline(
-            common.get_u55_compile_spec(),
-            module,
-            test_data,
-            export_target,
-        )
-
-    def _test_select_tosa_u85_BI_pipeline(
-        self, module: torch.nn.Module, test_data: test_data_t, export_target: str
-    ):
-        self._test_select_ethos_BI_pipeline(
-            common.get_u85_compile_spec(),
-            module,
-            test_data,
-            export_target,
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_copy_tosa_MI(self, test_data: test_data_t):
-        self._test_select_tosa_MI_pipeline(
-            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_int_tosa_MI(self, test_data: test_data_t):
-        self._test_select_tosa_MI_pipeline(
-            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_copy_tosa_BI(self, test_data: test_data_t):
-        self._test_select_tosa_BI_pipeline(
-            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_int_tosa_BI(self, test_data: test_data_t):
-        self._test_select_tosa_BI_pipeline(
-            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_copy_tosa_u55_BI(self, test_data: test_data_t):
-        self._test_select_tosa_u55_BI_pipeline(
-            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_int_tosa_u55_BI(self, test_data: test_data_t):
-        self._test_select_tosa_u55_BI_pipeline(
-            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_copy_tosa_u85_BI(self, test_data: test_data_t):
-        self._test_select_tosa_u85_BI_pipeline(
-            self.SelectCopy(), test_data, export_target="torch.ops.aten.select_copy.int"
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_select_int_tosa_u85_BI(self, test_data: test_data_t):
-        self._test_select_tosa_u85_BI_pipeline(
-            self.SelectInt(), test_data, export_target="torch.ops.aten.select.int"
-        )
+    "select3d_neg_1_dim_0_index": lambda: (torch.zeros(5, 3, 20), -1, 0),
+    "select3d_0_dim_neg_1_index": lambda: (torch.rand(5, 3, 20), 0, -1),
+    "select3d_0_dim_4_index": lambda: (torch.zeros(5, 3, 20), 0, 4),
+    "select3d_0_dim_2_index": lambda: (torch.ones(10, 10, 10), 0, 2),
+    "select4d_0_dim_2_index": lambda: (torch.rand(5, 3, 20, 2), 0, 2),
+    "select2d_0_dim_0_index": lambda: (torch.rand(10, 10) - 0.5, 0, 0),
+    "select1d_0_dim_1_index": lambda: (torch.randn(10) + 10, 0, 1),
+    "select1d_0_dim_0_index": lambda: (torch.randn(10) - 10, 0, 2),
+    "select3d_0_dim_1_index": lambda: (torch.arange(-16, 16, 0.2), 0, 1),
+}
+
+aten_op_copy = "torch.ops.aten.select_copy.int"
+aten_op_int = "torch.ops.aten.select.int"
+
+
+class SelectCopy(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, dim: int, index: int):
+        return torch.select_copy(x, dim=dim, index=index)
+
+
+class SelectInt(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, dim: int, index: int):
+        return torch.select(x, dim=dim, index=index)
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_select_int_tosa_MI_copy(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](
+        SelectCopy(),
+        test_data(),
+        aten_op=aten_op_copy,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_select_int_tosa_MI(test_data: Tuple):
+    pipeline = TosaPipelineMI[input_t1](
+        SelectInt(),
+        test_data(),
+        aten_op=aten_op_int,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_select_int_tosa_BI_copy(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](
+        SelectCopy(),
+        test_data(),
+        aten_op=aten_op_copy,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_select_int_tosa_BI(test_data: Tuple):
+    pipeline = TosaPipelineBI[input_t1](
+        SelectInt(),
+        test_data(),
+        aten_op=aten_op_int,
+        exir_op=[],
+    )
+    pipeline.run()
+
+
+x_fails = {
+    "select4d_0_dim_2_index": "AssertionError: Output 0 does not match reference output."
+}
+
+
+@common.parametrize("test_data", test_data_suite, x_fails)
+@common.XfailIfNoCorstone300
+def test_select_int_u55_BI_copy(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        SelectCopy(),
+        test_data(),
+        aten_op_copy,
+        exir_ops=[],
+        run_on_fvp=True,
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite, x_fails)
+@common.XfailIfNoCorstone300
+def test_select_int_u55_BI(test_data: Tuple):
+    pipeline = EthosU55PipelineBI[input_t1](
+        SelectInt(),
+        test_data(),
+        aten_op_int,
+        exir_ops=[],
+        run_on_fvp=True,
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite, x_fails)
+@common.XfailIfNoCorstone320
+def test_select_int_u85_BI_copy(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        SelectCopy(),
+        test_data(),
+        aten_op_copy,
+        exir_ops=[],
+        run_on_fvp=True,
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite, x_fails)
+@common.XfailIfNoCorstone320
+def test_select_int_u85_BI(test_data: Tuple):
+    pipeline = EthosU85PipelineBI[input_t1](
+        SelectInt(),
+        test_data(),
+        aten_op_int,
+        exir_ops=[],
+        run_on_fvp=True,
+        use_to_edge_transform_and_lower=True,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py
index 43b4abd2039..b5ee68b987b 100644
--- a/backends/arm/test/ops/test_sigmoid.py
+++ b/backends/arm/test/ops/test_sigmoid.py
@@ -5,189 +5,158 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import unittest
 from typing import Tuple

-import pytest
-
 import torch

 from executorch.backends.arm.test import common, conftest
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+aten_op = "torch.ops.aten.sigmoid.default"  # Used for checking that we do not have softmax in the graph after decompose
+exir_op = "executorch_exir_dialects_edge__ops_aten_sigmoid_default"
+input_t1 = Tuple[torch.Tensor]  # Input x
+
+test_data_suite = {
+    # (test_name, test_data)
+    "zeros": lambda: torch.zeros(10, 10, 10, 10),
+    "ones": lambda: torch.ones(10, 10, 10),
+    "rand": lambda: torch.rand(10, 10) - 0.5,
+    "randn_pos": lambda: torch.randn(10) + 10,
+    "randn_neg": lambda: torch.randn(10) - 10,
+    "ramp": lambda: torch.arange(-16, 16, 0.2),
+}

-test_data_suite = [
-    # (test_name, test_data)
-    ("zeros", torch.zeros(10, 10, 10, 10)),
-    ("ones", torch.ones(10, 10, 10)),
-    ("rand", torch.rand(10, 10) - 0.5),
-    ("randn_pos", torch.randn(10) + 10),
-    ("randn_neg", torch.randn(10) - 10),
-    ("ramp", torch.arange(-16, 16, 0.2)),
-]
-
-
-class TestSigmoid(unittest.TestCase):
-    class Sigmoid(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.sigmoid = torch.nn.Sigmoid()
-
-        def forward(self, x):
-            return self.sigmoid(x)
-
-    class AddSigmoid(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.sigmoid = torch.nn.Sigmoid()
-
-        def forward(self, x):
-            return self.sigmoid(x + x)
-
-    class SigmoidAdd(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.sigmoid = torch.nn.Sigmoid()
-
-        def forward(self, x):
-            return x + self.sigmoid(x)
-
-    class SigmoidAddSigmoid(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.sigmoid = torch.nn.Sigmoid()
-
-        def forward(self, x, y):
-            return self.sigmoid((self.sigmoid(y) + self.sigmoid(x)))
-
-    def _test_sigmoid_tosa_MI_pipeline(
-        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
-    ):
-        tester = (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
-            )
-            .export()
-            .check(["torch.ops.aten.sigmoid.default"])
-            .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-        if conftest.is_option_enabled("tosa_ref_model"):
-            tester.run_method_and_compare_outputs(inputs=test_data)
-
-    def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        tester = (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
-            .quantize()
-            .export()
-            .check(["torch.ops.aten.sigmoid.default"])
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-        if conftest.is_option_enabled("tosa_ref_model"):
-            tester.run_method_and_compare_outputs(inputs=test_data)
-
-    def _test_sigmoid_tosa_ethos_BI_pipeline(
-        self,
-        compile_spec: list[CompileSpec],
-        module: torch.nn.Module,
-        test_data: Tuple[torch.tensor],
-    ):
-        (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=compile_spec,
-            )
-            .quantize()
-            .export()
-            .check_count({"torch.ops.aten.sigmoid.default": 1})
-            .check(["torch.ops.quantized_decomposed"])
-            .to_edge()
-            .partition()
-            .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"])
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-            .to_executorch()
-        )
-
-    def _test_sigmoid_tosa_u55_BI_pipeline(
-        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
-    ):
-        self._test_sigmoid_tosa_ethos_BI_pipeline(
-            common.get_u55_compile_spec(), module, test_data
-        )
-
-    def _test_sigmoid_tosa_u85_BI_pipeline(
-        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
-    ):
-        self._test_sigmoid_tosa_ethos_BI_pipeline(
-            common.get_u85_compile_spec(), module, test_data
-        )
-
-    @parameterized.expand(test_data_suite)
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-    ):
-        self._test_sigmoid_tosa_MI_pipeline(self.Sigmoid(), (test_data,))
-
-    @parameterized.expand(test_data_suite)
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_sigmoid_tosa_BI_pipeline(self.Sigmoid(), (test_data,))
-
-    @pytest.mark.tosa_ref_model
-    def test_add_sigmoid_tosa_MI(self):
-        self._test_sigmoid_tosa_MI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],))
-
-    @pytest.mark.tosa_ref_model
-    def test_add_sigmoid_tosa_BI(self):
-        self._test_sigmoid_tosa_BI_pipeline(self.AddSigmoid(), (test_data_suite[5][1],))
-
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_add_tosa_MI(self):
-        self._test_sigmoid_tosa_MI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],))
-
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_add_tosa_BI(self):
-        self._test_sigmoid_tosa_BI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],))
-
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_add_sigmoid_tosa_MI(self):
-        self._test_sigmoid_tosa_MI_pipeline(
-            self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1])
-        )
-
-    @pytest.mark.tosa_ref_model
-    def test_sigmoid_add_sigmoid_tosa_BI(self):
-        self._test_sigmoid_tosa_BI_pipeline(
-            self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1])
-        )
-
-    @parameterized.expand(test_data_suite)
-    def test_sigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_sigmoid_tosa_u55_BI_pipeline(self.Sigmoid(), (test_data,))
-
-    @parameterized.expand(test_data_suite)
-    def test_sigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
-        self._test_sigmoid_tosa_u85_BI_pipeline(self.Sigmoid(), (test_data,))
+class Sigmoid(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x):
+        return self.sigmoid(x)
+
+
+class AddSigmoid(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x):
+        return self.sigmoid(x + x)
+
+
+class SigmoidAdd(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x):
+        return x + self.sigmoid(x)
+
+
+class SigmoidAddSigmoid(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x, y):
+        return self.sigmoid((self.sigmoid(y) + self.sigmoid(x)))
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_sigmoid_tosa_MI(test_data: torch.Tensor):
+    TosaPipelineMI[input_t1](Sigmoid(), (test_data(),), aten_op, exir_op).run()
+
+
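Note: a word on the TosaPipelineMI[input_t1] notation used throughout. The subscription is plain typing.Generic machinery and does not change runtime behaviour; it only records the input type for readers and type checkers. A minimal sketch of the assumed shape (hypothetical skeleton; the real classes live in test_pipeline.py and do considerably more):

from typing import Generic, Tuple, TypeVar

import torch

T = TypeVar("T")

class ExamplePipeline(Generic[T]):
    # The real pipelines stage export/quantize/partition steps and execute
    # them in order when run() is called.
    def __init__(self, module: torch.nn.Module, inputs: T, aten_op: str, exir_op=None, **kwargs):
        self.module = module
        self.inputs = inputs

    def run(self) -> None:
        pass

# Subscripting only records the type parameter:
ExamplePipeline[Tuple[torch.Tensor]](torch.nn.ReLU(), (torch.ones(1),), "aten::relu").run()
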
+@common.parametrize("test_data", test_data_suite) +def test_sigmoid_tosa_BI(test_data: torch.Tensor): + TosaPipelineBI[input_t1](Sigmoid(), (test_data(),), aten_op, exir_op).run() + + +def test_sigmoid_tosa_MI_add(): + TosaPipelineMI[input_t1]( + AddSigmoid(), + (test_data_suite["zeros"](),), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +def test_sigmoid_tosa_BI_add(): + TosaPipelineBI[input_t1]( + AddSigmoid(), + (test_data_suite["ramp"](),), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +def test_sigmoid_tosa_MI_add_2(): + TosaPipelineMI[input_t1]( + SigmoidAdd(), + (test_data_suite["zeros"](),), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +def test_sigmoid_tosa_BI_add_2(): + TosaPipelineBI[input_t1]( + SigmoidAdd(), + (test_data_suite["zeros"](),), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +def test_sigmoid_tosa_MI_add_3(): + TosaPipelineMI[input_t1]( + SigmoidAddSigmoid(), + (test_data_suite["randn_neg"](), test_data_suite["randn_pos"]()), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +def test_sigmoid_tosa_BI_3(): + TosaPipelineBI[input_t1]( + SigmoidAddSigmoid(), + (test_data_suite["randn_neg"](), test_data_suite["randn_pos"]()), + aten_op, + exir_op, + tosa_version=conftest.get_option("tosa_version"), + ).run() + + +@common.parametrize("test_data", test_data_suite) +def test_sigmoid_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Sigmoid(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_sigmoid_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Sigmoid(), + (test_data(),), + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index 3cd11699a0a..ddec8c61eb9 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -11,12 +11,13 @@ TOSAQuantizer, ) from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU85PipelineBI, OpNotSupportedPipeline, TosaPipelineBI, ) +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester import Quantize from torch.ao.quantization.observer import HistogramObserver from torch.ao.quantization.quantizer import QuantizationSpec @@ -37,9 +38,18 @@ def _get_16_bit_quant_config(): return qconfig -def get_16bit_sigmoid_quantizer(tosa_str: str): - tosa_spec = common.TosaSpecification.create_from_string(tosa_str) - quantizer = TOSAQuantizer(tosa_spec) +def get_16bit_sigmoid_quantizer(u55_config=False): + tosa_version = conftest.get_option("tosa_version") + tosa_profiles = { + "0.80": TosaSpecification.create_from_string( + "TOSA-0.80+BI" + ("+u55" if u55_config else "") + ), + "1.0": TosaSpecification.create_from_string( + "TOSA-1.0+INT" + ("+u55" if u55_config else "") + ), + } + + quantizer = TOSAQuantizer(tosa_profiles[tosa_version]) quantizer.set_global(get_symmetric_quantization_config()) quantizer.set_module_type( torch.nn.modules.activation.Sigmoid, _get_16_bit_quant_config() @@ -86,7 
+96,7 @@ def test_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() @@ -96,26 +106,41 @@ def test_sigmoid_tosa_BI(test_data): xfails={ "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" }, + strict=False, ) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_add_sigmoid_tosa_BI(test_data): +def test_sigmoid_tosa_BI_add_sigmoid(test_data): pipeline = TosaPipelineBI( - SigmoidAddSigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) pipeline.run() +xfails = { + "ones": "AssertionError: Output 0 does not match reference output. MLETORCH-787", + "rand": "AssertionError: Output 0 does not match reference output. MLETORCH-787", + "rand_4d": "AssertionError: Output 0 does not match reference output. MLETORCH-787", + "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787", +} + + @common.parametrize( "test_data", test_data_suite, ) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_tosa_u55(test_data): +def test_sigmoid_u55_BI(test_data): pipeline = OpNotSupportedPipeline( - Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} + Sigmoid(), + (test_data(),), + {Sigmoid.exir_op: 1}, + quantize=True, + u55_subset=True, ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer(True)) pipeline.run() @@ -124,26 +149,31 @@ def test_sigmoid_tosa_u55(test_data): test_data_suite, ) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_add_sigmoid_tosa_u55(test_data): +def test_sigmoid_u55_BI_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), (test_data(),), - "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 3}, n_expected_delegates=1, + quantize=True, + u55_subset=True, ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer(True)) pipeline.run() @common.parametrize("test_data", test_data_suite) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 -def test_sigmoid_tosa_u85(test_data): +def test_sigmoid_u85_BI(test_data): pipeline = EthosU85PipelineBI( - Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + Sigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() @@ -156,7 +186,7 @@ def test_sigmoid_tosa_u85(test_data): ) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 -def test_sigmoid_add_sigmoid_tosa_u85(test_data): +def test_sigmoid_u85_BI_add_sigmoid(test_data): pipeline = EthosU85PipelineBI( SigmoidAddSigmoid(), (test_data(),), @@ -164,5 +194,5 @@ def test_sigmoid_add_sigmoid_tosa_u85(test_data): Sigmoid.exir_op, run_on_fvp=True, ) - pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + 
pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py b/backends/arm/test/ops/test_sigmoid_32bit.py index fbfc263a6d0..a0fe077da5f 100644 --- a/backends/arm/test/ops/test_sigmoid_32bit.py +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -7,12 +7,13 @@ import torch from executorch.backends.arm.quantizer import TOSAQuantizer from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU85PipelineBI, OpNotSupportedPipeline, TosaPipelineBI, ) +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester import Quantize from torch.ao.quantization.observer import HistogramObserver from torch.ao.quantization.quantizer import QuantizationSpec @@ -53,9 +54,18 @@ def _get_32_bit_quant_config(): return qconfig -def get_32bit_sigmoid_quantizer(tosa_str: str): - tosa_spec = common.TosaSpecification.create_from_string(tosa_str) - quantizer = TOSAQuantizer(tosa_spec) +def get_32bit_sigmoid_quantizer(u55_config=False): + tosa_version = conftest.get_option("tosa_version") + tosa_profiles = { + "0.80": TosaSpecification.create_from_string( + "TOSA-0.80+BI" + ("+u55" if u55_config else "") + ), + "1.0": TosaSpecification.create_from_string( + "TOSA-1.0+INT" + ("+u55" if u55_config else "") + ), + } + + quantizer = TOSAQuantizer(tosa_profiles[tosa_version]) quantizer.set_global(_get_32_bit_quant_config()) quantizer.set_module_type( torch.nn.modules.activation.Sigmoid, _get_16_bit_quant_config() @@ -105,55 +115,65 @@ def test_sigmoid_tosa_BI(test_data): Sigmoid.aten_op, Sigmoid.exir_op, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() @common.parametrize("test_data", test_data_suite) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_add_sigmoid_tosa_BI(test_data): +def test_sigmoid_tosa_BI_add_sigmoid(test_data): pipeline = TosaPipelineBI( SigmoidAddSigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() @common.parametrize("test_data", test_data_suite) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_tosa_u55(test_data): +def test_sigmoid_u55_BI(test_data): pipeline = OpNotSupportedPipeline( - Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} + Sigmoid(), + (test_data(),), + {Sigmoid.exir_op: 1}, + quantize=True, + u55_subset=True, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer(True)) pipeline.run() @common.parametrize("test_data", test_data_suite) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 -def test_sigmoid_add_sigmoid_tosa_u55(test_data): +def test_sigmoid_u55_BI_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), (test_data(),), - "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 3}, n_expected_delegates=1, + quantize=True, + u55_subset=True, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + 
pipeline.change_args("quantize", get_32bit_sigmoid_quantizer(True)) pipeline.run() @common.parametrize("test_data", test_data_suite) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 -def test_sigmoid_tosa_u85(test_data): +@pytest.mark.flaky(reruns=5) +def test_sigmoid_u85_BI(test_data): pipeline = EthosU85PipelineBI( - Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + Sigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() @@ -162,11 +182,14 @@ def test_sigmoid_tosa_u85(test_data): test_data_suite, xfails={ "ramp": "AssertionError: Output 0 does not match reference output.", + "rand": "AssertionError: Output 0 does not match reference output.", + "rand_4d": "AssertionError: Output 0 does not match reference output.", }, ) @pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 -def test_sigmoid_add_sigmoid_tosa_u85(test_data): +@pytest.mark.flaky(reruns=5) +def test_sigmoid_u85_BI_add_sigmoid(test_data): pipeline = EthosU85PipelineBI( SigmoidAddSigmoid(), (test_data(),), @@ -174,5 +197,5 @@ def test_sigmoid_add_sigmoid_tosa_u85(test_data): Sigmoid.exir_op, run_on_fvp=True, ) - pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py index 51748b02450..e1736bf10e6 100644 --- a/backends/arm/test/ops/test_silu.py +++ b/backends/arm/test/ops/test_silu.py @@ -30,14 +30,14 @@ def forward( return torch.nn.SiLU(inplace=_inplace)(_input) test_data: list[input_t] = { - "op_silu_rank1_ones": (torch.ones(5),), - "op_silu_rank1_negative_ones": (torch.ones(5) * (-1),), - "op_silu_rank1_rand": (torch.rand(5) * 5,), - "op_silu_rank4_ones": (torch.ones(1, 10, 25, 20),), - "op_silu_rank4_negative_ones": ((-1) * torch.ones(1, 10, 25, 20),), - "op_silu_rank4_large_rand": (200 * torch.rand(1, 10, 25, 20),), - "op_silu_rank4_negative_large_rand": ((-200) * torch.rand(1, 10, 25, 20),), - "op_silu_rank4_large_randn": (200 * torch.randn(1, 10, 25, 20) + 1,), + "op_silu_rank1_ones": lambda: torch.ones(5), + "op_silu_rank1_negative_ones": lambda: torch.ones(5) * (-1), + "op_silu_rank1_rand": lambda: torch.rand(5) * 5, + "op_silu_rank4_ones": lambda: torch.ones(1, 10, 25, 20), + "op_silu_rank4_negative_ones": lambda: (-1) * torch.ones(1, 10, 25, 20), + "op_silu_rank4_large_rand": lambda: 200 * torch.rand(1, 10, 25, 20), + "op_silu_rank4_negative_large_rand": lambda: (-200) * torch.rand(1, 10, 25, 20), + "op_silu_rank4_large_randn": lambda: 200 * torch.randn(1, 10, 25, 20) + 1, } aten_op_MI = "torch.ops.aten.silu.default" @@ -47,28 +47,28 @@ def forward( @common.parametrize("test_data", Silu.test_data) def test_silu_tosa_MI(test_data: input_t): - silu_data = (test_data[0], False) + silu_data = (test_data(), False) pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_MI) pipeline.run() @common.parametrize("test_data", Silu.test_data) def test_silu_tosa_MI_inplace(test_data: input_t): - silu_data = (test_data[0], True) + silu_data = (test_data(), True) pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_inplace_MI) pipeline.run() @common.parametrize("test_data", Silu.test_data) def 
test_silu_tosa_BI(test_data: input_t): - silu_data = (test_data[0], False) + silu_data = (test_data(), False) pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) pipeline.run() @common.parametrize("test_data", Silu.test_data) def test_silu_tosa_BI_inplace(test_data: input_t): - silu_data = (test_data[0], True) + silu_data = (test_data(), True) pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) pipeline.run() @@ -76,7 +76,7 @@ def test_silu_tosa_BI_inplace(test_data: input_t): @common.parametrize("test_data", Silu.test_data) @common.XfailIfNoCorstone300 def test_silu_u55_BI(test_data: input_t): - silu_data = (test_data[0], False) + silu_data = (test_data(), False) pipeline = EthosU55PipelineBI[input_t]( Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True ) @@ -86,7 +86,7 @@ def test_silu_u55_BI(test_data: input_t): @common.parametrize("test_data", Silu.test_data) @common.XfailIfNoCorstone300 def test_silu_u55_BI_inplace(test_data: input_t): - silu_data = (test_data[0], True) + silu_data = (test_data(), True) pipeline = EthosU55PipelineBI[input_t]( Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True ) @@ -96,7 +96,7 @@ def test_silu_u55_BI_inplace(test_data: input_t): @common.parametrize("test_data", Silu.test_data) @common.XfailIfNoCorstone320 def test_silu_u85_BI(test_data: input_t): - silu_data = (test_data[0], False) + silu_data = (test_data(), False) pipeline = EthosU85PipelineBI[input_t]( Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True ) @@ -106,7 +106,7 @@ def test_silu_u85_BI(test_data: input_t): @common.parametrize("test_data", Silu.test_data) @common.XfailIfNoCorstone320 def test_silu_u85_BI_inplace(test_data: input_t): - silu_data = (test_data[0], True) + silu_data = (test_data(), True) pipeline = EthosU85PipelineBI[input_t]( Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True ) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 91ef51cc2a2..6ae12c41657 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -4,135 +4,91 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
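For orientation before the rewritten slice tests: the Slice module below indexes a tensor with slice objects built from (start, stop) pairs. A minimal standalone sketch of that pattern, in plain PyTorch, with shapes chosen purely for illustration:

    import torch

    x = torch.ones(10, 10)
    # Build slice objects from (start, stop) pairs, mirroring the Slice module.
    slices = tuple(slice(*i) for i in [(1, 3), (3, None)])
    assert x[slices].shape == (2, 7)  # rows 1:3, columns 3:10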
-import unittest -from typing import Tuple -import pytest +from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - (torch.ones(10), [(3, -3)]), - (torch.ones(10), [(-8, 3)]), - (torch.ones(10, 10), [(1, 3), (3, None)]), - (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), - (torch.ones((1, 12, 10, 10)), [(None, None), (None, 5), (3, 5), (4, 10)]), -] - - -class TestSimpleSlice(unittest.TestCase): - - class Slice(torch.nn.Module): - def forward(self, x: torch.Tensor, s: list[tuple[int, int]]): - slices = [slice(*i) for i in s] - return x[slices] - - def _test_slice_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: torch.Tensor - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.slice.Tensor"]) - .to_edge() - .check(["executorch_exir_dialects_edge__ops_aten_slice_copy"]) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data) - - def _test_slice_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.slice.Tensor"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) - - def _test_slice_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) - .quantize() - .export() - .check(["torch.ops.aten.slice.Tensor"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_slice_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - self._test_slice_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_slice_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - self._test_slice_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.tosa_ref_model - def test_slice_tosa_MI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): - self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor, slices)) - - @parameterized.expand(test_data_suite) - @pytest.mark.tosa_ref_model - def test_slice_nchw_tosa_BI( - self, tensor: torch.Tensor, slices: list[tuple[int, int]] - ): - self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - - @parameterized.expand(test_data_suite) - @pytest.mark.tosa_ref_model - def test_slice_nhwc_tosa_BI( - self, tensor: torch.Tensor, slices: list[tuple[int, int]] - ): - self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - - @parameterized.expand(test_data_suite) - def test_slice_u55_BI(self, tensor: 
torch.Tensor, slices: list[tuple[int, int]]): - self._test_slice_u55_BI_pipeline(self.Slice(), (tensor, slices)) - - @parameterized.expand(test_data_suite) - def test_slice_u85_BI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): - self._test_slice_u85_BI_pipeline(self.Slice(), (tensor, slices)) +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.slice.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_slice_copy" + +input_t1 = Tuple[torch.Tensor] # Input x + +test_data_suite = { + "ones_neg_3": lambda: (torch.ones(10), [(3, -3)]), + "ones_neg_8": lambda: (torch.ones(10), [(-8, 3)]), + "ones_slice_2": lambda: (torch.ones(10, 10), [(1, 3), (3, None)]), + "ones_slice_3": lambda: (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), + "ones_slice_4": lambda: ( + torch.ones((1, 12, 10, 10)), + [(None, None), (None, 5), (3, 5), (4, 10)], + ), +} + + +class Slice(torch.nn.Module): + + def forward(self, x: torch.Tensor, s: list[tuple[int, int]]): + slices = [slice(*i) for i in s] + return x[slices] + + +@common.parametrize("test_data", test_data_suite) +def test_slice_tensor_tosa_MI(test_data: torch.Tensor): + pipeline = TosaPipelineMI[input_t1](Slice(), test_data(), aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_slice_tensor_tosa_BI_nchw(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Slice(), + test_data(), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_slice_tensor_tosa_BI_nhwc(test_data: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Slice(), + test_data(), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_slice_tensor_u55_BI(test_data: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Slice(), + test_data(), + aten_ops=[], + exir_ops=[], + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_slice_tensor_u85_BI(test_data: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Slice(), + test_data(), + aten_ops=[], + exir_ops=[], + run_on_fvp=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index dcee5d038f2..5ab616c0eea 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -18,7 +18,6 @@ aten_op = "torch.ops.aten.softmax.default" # Used for checking that we do not have softmax in the graph after decompose exir_op = "executorch_exir_dialects_edge__ops_aten__softmax_tensor" - input_t1 = Tuple[torch.Tensor] # Input x @@ -31,20 +30,20 @@ def forward(self, x): return self.softmax(x) test_data = { - "ones": ((torch.ones(10, 10),), 1), - "ones_neg_dim": ((torch.ones(1, 3, 4),), -1), - "randn_neg_dim": ((torch.randn(1, 5, 8, 7),), -3), - "zeros": ((torch.zeros(1, 8, 5, 2),), 0), - "zeros_neg_dim": ((torch.zeros(1, 7, 8, 9),), -4), - "rand": ((torch.rand(1, 2, 5, 8),), 2), - "rand_neg_dim": ((torch.rand(1, 10, 8, 10),), -2), - "randn_mult_batches": ((torch.randn(2, 10, 10, 10),), 3), + "ones": lambda: ((torch.ones(10, 10),), 1), + "ones_neg_dim": lambda: ((torch.ones(1, 3, 4),), -1), + "randn_neg_dim": lambda: ((torch.randn(1, 5, 8, 7),), -3), + "zeros": lambda: ((torch.zeros(1, 8, 5, 2),), 0), + "zeros_neg_dim": lambda: 
((torch.zeros(1, 7, 8, 9),), -4), + "rand": lambda: ((torch.rand(1, 2, 5, 8),), 2), + "rand_neg_dim": lambda: ((torch.rand(1, 10, 8, 10),), -2), + "randn_mult_batches": lambda: ((torch.randn(2, 10, 10, 10),), 3), } @common.parametrize("test_data", Softmax.test_data) def test_softmax_tosa_MI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = TosaPipelineMI[input_t1](Softmax(dim), data, []) pipeline.add_stage_after( "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op] @@ -54,7 +53,7 @@ def test_softmax_tosa_MI(test_data): @common.parametrize("test_data", Softmax.test_data) def test_softmax_tosa_BI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = TosaPipelineBI[input_t1](Softmax(dim), data, []) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -70,7 +69,7 @@ def test_softmax_tosa_BI(test_data): ) @common.XfailIfNoCorstone300 def test_softmax_u55_BI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = EthosU55PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -86,7 +85,7 @@ def test_softmax_u55_BI(test_data): ) @common.XfailIfNoCorstone320 def test_softmax_u85_BI(test_data): - data, dim = test_data + data, dim = test_data() pipeline = EthosU85PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index b86e27f1a4c..90458584995 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -1,141 +1,147 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
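As background for the split tests below: split accepts either a single chunk size or an explicit list of section sizes along one dimension. A minimal sketch, plain PyTorch, sizes chosen only for illustration:

    import torch

    x = torch.rand(10, 15, 10)
    # Explicit section sizes along dim 1; they must sum to x.shape[1] == 15.
    parts = x.split(split_size=[2, 2, 11], dim=1)
    assert [p.shape[1] for p in parts] == [2, 2, 11]
    # An int chunk size instead yields equal chunks, the last possibly smaller.
    assert len(torch.rand(10).split(2, dim=0)) == 5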
-import unittest + +from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_t = tuple[torch.Tensor, int | list[int], int] - - -class TestSimpleSplit(unittest.TestCase): - class Split(torch.nn.Module): - - test_data: list[tuple[test_data_t]] = [ - ((torch.rand(10), 2, 0),), - ((torch.rand(10, 10), 3, 1),), - ((torch.rand(10, 10), 4, -1),), - ((torch.rand(10, 15, 10), [2, 2, 11], 1),), - ((torch.rand(4, 4, 4, 4), 2, 0),), - ((torch.rand(4, 4, 4, 4), [1, 1, 1, 1], -2),), - ] - - def forward( - self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int - ): - return x.split(split_size=split_size_or_sections, dim=dim) - - class SplitWithSizes(torch.nn.Module): - def forward(self, x: torch.Tensor, split_sizes: list[int], dim: int): - return x.split_with_sizes(split_sizes=split_sizes, dim=dim) - - class SplitSingleOut(torch.nn.Module): - def forward( - self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int - ): - return x.split(split_size=split_size_or_sections, dim=dim)[1] - - class SplitTwoOut(torch.nn.Module): - def forward( - self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int - ): - return x.split(split_size=split_size_or_sections, dim=dim)[1:3] - - def _test_split_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: test_data_t +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +exir_op = "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default" +input_t1 = Tuple[torch.Tensor] # Input x + + +class Split(torch.nn.Module): + + test_data = { + "split_1d_2_size_0_dim": lambda: (torch.rand(10), 2, 0), + "split_2d_3_size_1_dim": lambda: (torch.rand(10, 10), 3, 1), + "split_2d_2_size_4_dim": lambda: (torch.rand(10, 10), 4, -1), + "split_4d_2_size_2_dim": lambda: (torch.rand(4, 4, 4, 4), 2, 0), + } + + test_data_list = { + "split_3d_2_sizes_dim": lambda: (torch.rand(10, 15, 10), [2, 2, 11], 1), + "split_4d_2_sizes_dim_neg": lambda: (torch.rand(4, 4, 4, 4), [1, 1, 1, 1], -2), + } + + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .to_edge() - .check( - [ - "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default" - ] - ) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_split_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: test_data_t + return x.split(split_size=split_size_or_sections, dim=dim) + + +class SplitWithSizes(torch.nn.Module): + def forward(self, x: torch.Tensor, split_sizes: list[int], dim: int): + return x.split_with_sizes(split_sizes=split_sizes, dim=dim) + + +class SplitSingleOut(torch.nn.Module): + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int ): + return x.split(split_size=split_size_or_sections, dim=dim)[1] + - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .to_edge() - .partition() - 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_split_ethosu_BI_pipeline( - self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: test_data_t +class SplitTwoOut(torch.nn.Module): + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - @parameterized.expand(Split.test_data) - def test_split_tosa_MI(self, test_data: test_data_t): - self._test_split_tosa_MI_pipeline(self.Split(), test_data) - - @parameterized.expand([Split.test_data[3], Split.test_data[5]]) - def test_split_with_sizes_tosa_MI(self, test_data: test_data_t): - assert isinstance(test_data[1], list) - self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data) - - @parameterized.expand(Split.test_data) - def test_split_one_out_tosa_MI(self, test_data: test_data_t): - self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data) - - @parameterized.expand(Split.test_data) - def test_split_two_out_tosa_MI(self, test_data: test_data_t): - self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data) - - @parameterized.expand(Split.test_data) - def test_split_tosa_BI(self, test_data: test_data_t): - self._test_split_tosa_BI_pipeline(self.Split(), test_data) - - @parameterized.expand(Split.test_data) - def test_split_u55_BI(self, test_data: test_data_t): - self._test_split_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Split(), test_data - ) - - @parameterized.expand(Split.test_data) - def test_split_u85_BI(self, test_data: test_data_t): - self._test_split_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Split(), test_data - ) + return x.split(split_size=split_size_or_sections, dim=dim)[1:3] + + +@common.parametrize( + "test_data", + (Split.test_data | Split.test_data_list), +) +def test_split_with_sizes_tosa_MI(test_data: input_t1): + + pipeline = TosaPipelineMI[input_t1]( + Split(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", Split.test_data_list) +def test_split_with_sizes_tosa_MI_2(test_data: input_t1): + + pipeline = TosaPipelineMI[input_t1]( + SplitWithSizes(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize( + "test_data", + (Split.test_data | Split.test_data_list), +) +def test_split_with_sizes_tosa_MI_one_out(test_data: input_t1): + + pipeline = TosaPipelineMI[input_t1]( + SplitSingleOut(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize( + "test_data", + (Split.test_data | Split.test_data_list), +) +def test_split_with_sizes_tosa_BI(test_data: input_t1): + + pipeline = TosaPipelineBI[input_t1]( + Split(), + test_data(), + aten_op=[], + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize( + "test_data", + (Split.test_data | Split.test_data_list), +) +def test_split_with_sizes_u55_BI(test_data: input_t1): + pipeline = EthosU55PipelineBI[input_t1]( + Split(), + test_data(), + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize( + "test_data", + (Split.test_data | Split.test_data_list), +) +def test_split_with_sizes_u85_BI(test_data: input_t1): + + pipeline = 
EthosU85PipelineBI[input_t1]( + Split(), + test_data(), + aten_ops=[], + exir_ops=exir_op, + run_on_fvp=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py index 53a1e79c0a8..0c79f534656 100644 --- a/backends/arm/test/ops/test_sqrt.py +++ b/backends/arm/test/ops/test_sqrt.py @@ -31,11 +31,11 @@ def forward(self, x): return torch.sqrt(x) test_data: Dict[str, input_t] = { - "sqrt_tensor_rank1_ones": (torch.ones(10),), - "sqrt_tensor_rank2_random": (torch.rand(5, 10),), - "sqrt_tensor_rank3_ones": (torch.ones(2, 3, 4),), - "sqrt_tensor_rank4_random": (torch.rand(1, 3, 8, 8),), - "sqrt_tensor_rank4_multibatch": (torch.rand(2, 3, 4, 4),), + "sqrt_tensor_rank1_ones": lambda: (torch.ones(10),), + "sqrt_tensor_rank2_random": lambda: (torch.rand(5, 10),), + "sqrt_tensor_rank3_ones": lambda: (torch.ones(2, 3, 4),), + "sqrt_tensor_rank4_random": lambda: (torch.rand(1, 3, 8, 8),), + "sqrt_tensor_rank4_multibatch": lambda: (torch.rand(2, 3, 4, 4),), } @@ -47,7 +47,10 @@ def forward(self, x): @common.parametrize("test_data", Sqrt.test_data) def test_sqrt_tosa_MI(test_data: Sqrt.input_t): pipeline = TosaPipelineMI[Sqrt.input_t]( - Sqrt(), test_data, Sqrt.aten_op_MI, Sqrt.exir_op_MI + Sqrt(), + test_data(), + Sqrt.aten_op_MI, + Sqrt.exir_op_MI, ) pipeline.run() @@ -55,7 +58,10 @@ def test_sqrt_tosa_MI(test_data: Sqrt.input_t): @common.parametrize("test_data", Sqrt.test_data) def test_sqrt_tosa_BI(test_data: Sqrt.input_t): pipeline = TosaPipelineBI[Sqrt.input_t]( - Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI + Sqrt(), + test_data(), + Sqrt.aten_op_BI, + Sqrt.exir_op_BI, ) pipeline.run() @@ -64,7 +70,11 @@ def test_sqrt_tosa_BI(test_data: Sqrt.input_t): @common.XfailIfNoCorstone300 def test_sqrt_u55_BI(test_data: Sqrt.input_t): pipeline = EthosU55PipelineBI[Sqrt.input_t]( - Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + Sqrt(), + test_data(), + Sqrt.aten_op_BI, + Sqrt.exir_op_BI, + run_on_fvp=True, ) pipeline.run() @@ -73,6 +83,10 @@ def test_sqrt_u55_BI(test_data: Sqrt.input_t): @common.XfailIfNoCorstone320 def test_sqrt_u85_BI(test_data: Sqrt.input_t): pipeline = EthosU85PipelineBI[Sqrt.input_t]( - Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + Sqrt(), + test_data(), + Sqrt.aten_op_BI, + Sqrt.exir_op_BI, + run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index 9f02392e1e2..e5f606c887e 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,208 +7,194 @@ # Tests the squeeze op which squeezes a given dimension with size 1 into a lower ranked tensor. 
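A quick illustration of that behavior, using plain PyTorch and shapes matching the test parameters below:

    import torch

    x = torch.randn(1, 5, 1, 5)
    assert x.squeeze().shape == (5, 5)         # all size-1 dims removed
    assert x.squeeze(-2).shape == (1, 5, 5)    # only dim -2 removed
    assert x.squeeze(1).shape == (1, 5, 1, 5)  # dim 1 has size 5, left unchanged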
# -import unittest -from typing import Optional, Tuple + +from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester - -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestSqueeze(unittest.TestCase): - class SqueezeDim(torch.nn.Module): - test_parameters: list[tuple[torch.Tensor, int]] = [ - (torch.randn(1, 1, 5), -2), - (torch.randn(1, 2, 3, 1), 3), - (torch.randn(1, 5, 1, 5), -2), - ] - - def forward(self, x: torch.Tensor, dim: int): - return x.squeeze(dim) - - class SqueezeDims(torch.nn.Module): - test_parameters: list[tuple[torch.Tensor, tuple[int]]] = [ - (torch.randn(1, 1, 5), (0, 1)), - (torch.randn(1, 5, 5, 1), (0, -1)), - (torch.randn(1, 5, 1, 5), (0, -2)), - ] - - def forward(self, x: torch.Tensor, dims: tuple[int]): - return x.squeeze(dims) - - class Squeeze(torch.nn.Module): - test_parameters: list[tuple[torch.Tensor]] = [ - (torch.randn(1, 1, 5),), - (torch.randn(1, 5, 5, 1),), - (torch.randn(1, 5, 1, 5),), - ] - - def forward(self, x: torch.Tensor): - return x.squeeze() - - def _test_squeeze_tosa_MI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor, Optional[tuple[int]]], - export_target: str, - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({export_target: 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_squeeze_tosa_BI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor, Optional[tuple[int]]], - export_target: str, - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({export_target: 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_squeeze_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor, Optional[tuple[int]]], - export_target: str, - ): - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize() - .export() - .check_count({export_target: 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - @parameterized.expand(Squeeze.test_parameters) - def test_squeeze_tosa_MI( - self, - test_tensor: torch.Tensor, - ): - self._test_squeeze_tosa_MI_pipeline( - self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default" - ) - - @parameterized.expand(Squeeze.test_parameters) - def test_squeeze_tosa_BI( - self, - test_tensor: torch.Tensor, - ): - self._test_squeeze_tosa_BI_pipeline( - self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default" - ) - - @parameterized.expand(Squeeze.test_parameters) - def test_squeeze_u55_BI( - self, - test_tensor: torch.Tensor, - ): - self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(), - self.Squeeze(), - (test_tensor,), - "torch.ops.aten.squeeze.default", - ) - - @parameterized.expand(Squeeze.test_parameters) - def test_squeeze_u85_BI( - self, - test_tensor: torch.Tensor, - ): - self._test_squeeze_ethosu_BI_pipeline( - 
common.get_u85_compile_spec(), - self.Squeeze(), - (test_tensor,), - "torch.ops.aten.squeeze.default", - ) - - @parameterized.expand(SqueezeDim.test_parameters) - def test_squeeze_dim_tosa_MI(self, test_tensor: torch.Tensor, dim: int): - self._test_squeeze_tosa_MI_pipeline( - self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim" - ) - - @parameterized.expand(SqueezeDim.test_parameters) - def test_squeeze_dim_tosa_BI(self, test_tensor: torch.Tensor, dim: int): - self._test_squeeze_tosa_BI_pipeline( - self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim" - ) - - @parameterized.expand(SqueezeDim.test_parameters) - def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): - self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(), - self.SqueezeDim(), - (test_tensor, dim), - "torch.ops.aten.squeeze.dim", - ) - - @parameterized.expand(SqueezeDim.test_parameters) - def test_squeeze_dim_u85_BI(self, test_tensor: torch.Tensor, dim: int): - self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(), - self.SqueezeDim(), - (test_tensor, dim), - "torch.ops.aten.squeeze.dim", - ) - - @parameterized.expand(SqueezeDims.test_parameters) - def test_squeeze_dims_tosa_MI(self, test_tensor: torch.Tensor, dims: tuple[int]): - self._test_squeeze_tosa_MI_pipeline( - self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims" - ) - - @parameterized.expand(SqueezeDims.test_parameters) - def test_squeeze_dims_tosa_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): - self._test_squeeze_tosa_BI_pipeline( - self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims" - ) - - @parameterized.expand(SqueezeDims.test_parameters) - def test_squeeze_dims_u55_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): - self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(), - self.SqueezeDims(), - (test_tensor, dims), - "torch.ops.aten.squeeze.dims", - ) - - @parameterized.expand(SqueezeDims.test_parameters) - def test_squeeze_dims_u85_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): - self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(), - self.SqueezeDims(), - (test_tensor, dims), - "torch.ops.aten.squeeze.dims", - ) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] # Input x + + +class SqueezeDim(torch.nn.Module): + test_parameters = { + "squeeze3d_dim_neg_2": lambda: (torch.randn(1, 1, 5), -2), + "squeeze4d_dim_pos_3": lambda: (torch.randn(1, 2, 3, 1), 3), + "squeeze4d_dim_neg_2": lambda: (torch.randn(1, 5, 1, 5), -2), + } + + def forward(self, x: torch.Tensor, dim: int): + return x.squeeze(dim) + + +class SqueezeDims(torch.nn.Module): + test_parameters = { + "squeeze3d_dims_0_1": lambda: (torch.randn(1, 1, 5), (0, 1)), + "squeeze4d_dims_0_neg_1": lambda: (torch.randn(1, 5, 5, 1), (0, -1)), + "squeeze4d_dims_0_neg_2": lambda: (torch.randn(1, 5, 1, 5), (0, -2)), + } + + def forward(self, x: torch.Tensor, dims: tuple[int]): + return x.squeeze(dims) + + +class Squeeze(torch.nn.Module): + test_parameters = { + "squeeze3d": lambda: (torch.randn(1, 1, 5),), + "squeeze4d_dims": lambda: (torch.randn(1, 5, 5, 1),), + "squeeze3d_dims_mix": lambda: (torch.randn(1, 5, 1, 5),), + } + + def forward(self, x: torch.Tensor): + return x.squeeze() + + +@common.parametrize("test_data", Squeeze.test_parameters) +def test_squeeze_dim_tosa_MI(test_data: Tuple): + pipeline =
TosaPipelineMI[input_t1]( + Squeeze(), + test_data(), + aten_op="torch.ops.aten.squeeze.default", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Squeeze.test_parameters) +def test_squeeze_dim_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + Squeeze(), + test_data(), + aten_op="torch.ops.aten.squeeze.default", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Squeeze.test_parameters) +@common.XfailIfNoCorstone300 +def test_squeeze_dim_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Squeeze(), + test_data(), + aten_ops="torch.ops.aten.squeeze.default", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Squeeze.test_parameters) +@common.XfailIfNoCorstone320 +def test_squeeze_dim_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Squeeze(), + test_data(), + aten_ops="torch.ops.aten.squeeze.default", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDim.test_parameters) +def test_squeeze_dim_tosa_MI_2(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + SqueezeDim(), + test_data(), + aten_op="torch.ops.aten.squeeze.dim", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDim.test_parameters) +def test_squeeze_dim_tosa_BI_2(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + SqueezeDim(), + test_data(), + aten_op="torch.ops.aten.squeeze.dim", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDim.test_parameters) +@common.XfailIfNoCorstone300 +def test_squeeze_dim_u55_BI_2(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + SqueezeDim(), + test_data(), + aten_ops="torch.ops.aten.squeeze.dim", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDim.test_parameters) +@common.XfailIfNoCorstone320 +def test_squeeze_dim_u85_BI_2(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + SqueezeDim(), + test_data(), + aten_ops="torch.ops.aten.squeeze.dim", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDims.test_parameters) +def test_squeeze_dims_tosa_MI(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + SqueezeDims(), + test_data(), + aten_op="torch.ops.aten.squeeze.dims", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDims.test_parameters) +def test_squeeze_dims_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + SqueezeDims(), + test_data(), + aten_op="torch.ops.aten.squeeze.dims", + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDims.test_parameters) +@common.XfailIfNoCorstone300 +def test_squeeze_dims_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + SqueezeDims(), + test_data(), + aten_ops="torch.ops.aten.squeeze.dims", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", SqueezeDims.test_parameters) +@common.XfailIfNoCorstone320 +def test_squeeze_dims_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + SqueezeDims(), + test_data(), + aten_ops="torch.ops.aten.squeeze.dims", + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index d1849e830c9..f61f3b0583d 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -5,7 +5,6 @@ # 
This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - from typing import Tuple import torch @@ -22,23 +21,23 @@ # Single-input subtraction (x - x) sub_test_data = { - "ones_1D_5": (torch.ones(5),), - "ones_1D_50": (torch.ones(50),), - "rand_1D_10": (torch.rand(10),), - "rand_2D_5x5": (torch.rand(5, 5),), - "rand_3D_5x5x5": (torch.rand(5, 5, 5),), - "rand_4D_2x3x4x5": (torch.rand(2, 3, 4, 5),), - "zeros": (torch.zeros(10),), + "ones_1D_5": lambda: (torch.ones(5),), + "ones_1D_50": lambda: (torch.ones(50),), + "rand_1D_10": lambda: (torch.rand(10),), + "rand_2D_5x5": lambda: (torch.rand(5, 5),), + "rand_3D_5x5x5": lambda: (torch.rand(5, 5, 5),), + "rand_4D_2x3x4x5": lambda: (torch.rand(2, 3, 4, 5),), + "zeros": lambda: (torch.zeros(10),), } fvp_sub_xfails = {"rand_4D_2x3x4x5": "MLETORCH-517 : Multiple batches not supported"} # Two-input subtraction (x - y) sub2_test_data = { - "rand_2D_4x4": (torch.rand(4, 4), torch.rand(4, 4)), - "rand_3D_4x4x4": (torch.rand(4, 2, 2), torch.rand(4, 2, 2)), - "rand_4D_2x2x4x4": (torch.rand(2, 2, 4, 4), torch.rand(2, 2, 4, 4)), - "zeros": (torch.rand(4, 4), torch.zeros(4, 4)), + "rand_2D_4x4": lambda: (torch.rand(4, 4), torch.rand(4, 4)), + "rand_3D_4x4x4": lambda: (torch.rand(4, 2, 2), torch.rand(4, 2, 2)), + "rand_4D_2x2x4x4": lambda: (torch.rand(2, 2, 4, 4), torch.rand(2, 2, 4, 4)), + "zeros": lambda: (torch.rand(4, 4), torch.zeros(4, 4)), } fvp_sub2_xfails = {"rand_4D_2x2x4x4": "MLETORCH-517 : Multiple batches not supported"} @@ -58,11 +57,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): @common.parametrize("test_data", sub_test_data) -def test_sub_tosa_MI(test_data): +def test_sub_tensor_tosa_MI(test_data): """Test Subtraction (TOSA MI)""" pipeline = TosaPipelineMI[input_t1]( Sub(), - test_data, + test_data(), aten_op, exir_op, ) @@ -70,11 +69,11 @@ def test_sub_tosa_MI(test_data): @common.parametrize("test_data", sub2_test_data) -def test_sub_2_tosa_MI(test_data: Tuple[torch.Tensor, torch.Tensor]): +def test_sub_tensor_tosa_MI_2(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction (TOSA MI)""" pipeline = TosaPipelineMI[input_t2]( Sub2(), - test_data, + test_data(), aten_op, exir_op, ) @@ -82,11 +81,11 @@ def test_sub_2_tosa_MI(test_data: Tuple[torch.Tensor, torch.Tensor]): @common.parametrize("test_data", sub_test_data) -def test_sub_tosa_BI(test_data): +def test_sub_tensor_tosa_BI(test_data): """Test Subtraction (TOSA BI)""" pipeline = TosaPipelineBI[input_t1]( Sub(), - test_data, + test_data(), aten_op, exir_op, ) @@ -95,11 +94,11 @@ def test_sub_tosa_BI(test_data): @common.parametrize("test_data", sub2_test_data) -def test_sub_2_tosa_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): +def test_sub_tensor_tosa_BI_2(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction (TOSA BI)""" pipeline = TosaPipelineBI[input_t2]( Sub2(), - test_data, + test_data(), aten_op, exir_op, ) @@ -107,65 +106,13 @@ def test_sub_2_tosa_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() -@common.parametrize("test_data", sub_test_data) -def test_sub_u55_BI(test_data): - """Test Subtraction on Ethos-U55""" - pipeline = EthosU55PipelineBI[input_t1]( - Sub(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - -@common.parametrize("test_data", sub2_test_data) -def test_sub_2_u55_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): - """Test Two-Operand Subtraction on Ethos-U55""" - pipeline = 
EthosU55PipelineBI[input_t2]( - Sub2(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - -@common.parametrize("test_data", sub_test_data) -def test_sub_u85_BI(test_data): - """Test Subtraction on Ethos-U85 (Quantized Mode)""" - pipeline = EthosU85PipelineBI[input_t1]( - Sub(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - -@common.parametrize("test_data", sub2_test_data) -def test_sub_2_u85_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): - """Test Two-Operand Subtraction on Ethos-U85""" - pipeline = EthosU85PipelineBI[input_t2]( - Sub2(), - test_data, - aten_op, - exir_op, - run_on_fvp=False, - ) - pipeline.run() - - @common.parametrize("test_data", sub_test_data, fvp_sub_xfails) -@common.SkipIfNoCorstone300 -def test_sub_u55_BI_on_fvp(test_data): +@common.XfailIfNoCorstone300 +def test_sub_tensor_u55_BI(test_data): """Test Subtraction on Ethos-U55 (FVP Mode)""" pipeline = EthosU55PipelineBI[input_t1]( Sub(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, @@ -175,12 +122,12 @@ def test_sub_u55_BI_on_fvp(test_data): @common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) -@common.SkipIfNoCorstone300 -def test_sub_2_u55_BI_on_fvp(test_data: Tuple[torch.Tensor, torch.Tensor]): +@common.XfailIfNoCorstone300 +def test_sub_tensor_u55_BI_2(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U55 (FVP Mode)""" pipeline = EthosU55PipelineBI[input_t2]( Sub2(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, @@ -190,12 +137,12 @@ def test_sub_2_u55_BI_on_fvp(test_data: Tuple[torch.Tensor, torch.Tensor]): @common.parametrize("test_data", sub_test_data, fvp_sub_xfails) -@common.SkipIfNoCorstone320 -def test_sub_u85_BI_on_fvp(test_data): +@common.XfailIfNoCorstone320 +def test_sub_tensor_u85_BI_2(test_data): """Test Subtraction on Ethos-U85 (FVP Mode)""" pipeline = EthosU85PipelineBI[input_t1]( Sub(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, @@ -205,12 +152,12 @@ def test_sub_u85_BI_on_fvp(test_data): @common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) -@common.SkipIfNoCorstone320 -def test_sub_2_u85_BI_on_fvp(test_data: Tuple[torch.Tensor, torch.Tensor]): +@common.XfailIfNoCorstone320 +def test_sub_tensor_u85_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U85 (FVP Mode)""" pipeline = EthosU85PipelineBI[input_t2]( Sub2(), - test_data, + test_data(), aten_op, exir_op, run_on_fvp=True, diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index bc0c50b8ee0..8837f1b292d 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -4,155 +4,104 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest - from typing import Tuple import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -exampledata_t = Tuple[torch.Tensor, int | list[int], bool] -"""(data, dim(s), keepdim)""" - - -class TestSum(unittest.TestCase): - """Tests sum which sums all elements along some specified dimensions. - keepdim specifies whether the dimension that is summed should - be squeezed or not. 
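A short illustration of the keepdim distinction referred to above (plain PyTorch):

    import torch

    x = torch.rand(10, 10)
    assert x.sum(dim=1, keepdim=False).shape == (10,)   # summed dim squeezed away
    assert x.sum(dim=1, keepdim=True).shape == (10, 1)  # summed dim kept as size 1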
- """ - - class Sum(torch.nn.Module): - test_parameters: list[Tuple[exampledata_t]] = [ - ((torch.rand(10), 0, True),), - ((torch.rand(10, 10), 1, False),), - ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(1, 1, 5, 8), 1, False),), - ((torch.rand(1, 2, 3, 4), 3, True),), - ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), - ] - - test_parameters_u55: list[Tuple[exampledata_t]] = [ - ((torch.rand(10), 0, True),), - ((torch.rand(10, 10), 1, False),), - ((torch.rand(1, 2, 3, 4), 3, True),), - ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(1, 1, 5, 8), 1, False),), - ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), - ] - - def forward(self, x: torch.Tensor, dim: int, keepdim: bool): - return x.sum(dim=dim, keepdim=keepdim) - - def _test_sum_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: tuple[exampledata_t] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.sum.dim_IntList": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_sum_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[exampledata_t] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.sum.dim_IntList": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_sum_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - test_data: tuple[exampledata_t], - compile_spec: CompileSpec, - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.sum.dim_IntList": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) - - @parameterized.expand(Sum.test_parameters) - def test_sum_tosa_MI(self, test_data: tuple[exampledata_t]): - self._test_sum_tosa_MI_pipeline(self.Sum(), test_data) - - @parameterized.expand(Sum.test_parameters) - def test_sum_tosa_BI(self, test_data: tuple[exampledata_t]): - self._test_sum_tosa_BI_pipeline(self.Sum(), test_data) - - @parameterized.expand(Sum.test_parameters_u55) - def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): - self._test_sum_ethosu_BI_pipeline( - self.Sum(), - test_data, - common.get_u55_compile_spec(), - ) - - @parameterized.expand(Sum.test_parameters) - def test_sum_u85_BI(self, test_data: tuple[exampledata_t]): - self._test_sum_ethosu_BI_pipeline( - self.Sum(), - test_data, - common.get_u85_compile_spec(), - ) - - reject_inputs = [ - ((torch.rand((65537, 1, 1)), 0, False),), - ((torch.rand((800, 90, 1)), 2, False),), - ((torch.rand((3, 2, 800, 90)), 1, False),), - ] - - @parameterized.expand(reject_inputs) - def test_reject_sum_u55_BI(self, example_inputs): - ( - ArmTester( - TestSum.Sum(), - example_inputs=example_inputs, - compile_spec=common.get_u55_compile_spec(), - ) - .quantize() - .export() 
- .check_count({"torch.ops.aten.sum.dim_IntList": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 0}) - .check(["executorch_exir_dialects_edge__ops_aten_sum_dim_IntList"]) - ) +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.sum.dim_IntList" +input_t1 = Tuple[torch.Tensor] # Input x + + +"""Tests sum which sums all elements along some specified dimensions. +keepdim specifies whether the dimension that is summed should +be squeezed or not. +""" + + +class Sum(torch.nn.Module): + test_parameters = { + "1d_dim_0_keep": lambda: (torch.rand(10), 0, True), + "2d_dim_1_no_keep": lambda: (torch.rand(10, 10), 1, False), + "3d_dims_keep": lambda: (torch.rand(10, 10, 10), [-3, 1], True), + "4d_dims_no_keep": lambda: (torch.rand(1, 1, 5, 8), 1, False), + "4d_dim_3_keep": lambda: (torch.rand(1, 2, 3, 4), 3, True), + "4d_dims_keep": lambda: (torch.rand(1, 2, 8, 8), [2, 3, 0], True), + } + + def forward(self, x: torch.Tensor, dim: int, keepdim: bool): + return x.sum(dim=dim, keepdim=keepdim) + + +@common.parametrize("test_data", Sum.test_parameters) +def test_sum_dim_intlist_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1]( + Sum(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Sum.test_parameters) +def test_sum_dim_intlist_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1]( + Sum(), + test_data(), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Sum.test_parameters) +@common.XfailIfNoCorstone300 +def test_view_u55_BI_1_0(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Sum(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Sum.test_parameters) +@common.XfailIfNoCorstone320 +def test_view_u85_BI_1_0(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Sum(), + test_data(), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() + + +reject_inputs = { + "reject_large_0_dim": lambda: (torch.rand((65537, 1, 1)), 0, False), + "reject_large_2_dim": lambda: (torch.rand((800, 90, 1)), 2, False), + "reject_large_1_dim": lambda: (torch.rand((3, 2, 800, 90)), 1, False), +} + + +@common.parametrize("test_data", reject_inputs) +def test_view_u55_BI_failure_set(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Sum(), + test_data(), + aten_op, + exir_ops=[], + ) + pipeline.pop_stage("check_count.exir") + pipeline.run() diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index 8d13620dc4a..73d51cb8c3e 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -1,142 +1,85 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. -# All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import unittest - from typing import Tuple -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) +aten_op = "torch.ops.aten.tanh.default" +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +test_data_suite = { # (test_name, test_data) - ("zeros", torch.zeros(10, 10, 10, 10)), - ("ones", torch.ones(10, 10, 10)), - ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), - ("randn_neg", torch.randn(10) - 10), - ("ramp", torch.arange(-16, 16, 0.2)), -] - - -class TestTanh(unittest.TestCase): - class Tanh(torch.nn.Module): - def __init__(self): - super().__init__() - self.tanh = torch.nn.Tanh() - - def forward(self, x): - return self.tanh(x) - - def _test_tanh_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.tanh.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data) - - def _test_tanh_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.tanh.default"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - if conftest.is_option_enabled("tosa_ref_model"): - tester.run_method_and_compare_outputs(inputs=test_data) - - def _test_tanh_tosa_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.tanh.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_tanh_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_tanh_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_tanh_tosa_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_tanh_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.tosa_ref_model - def test_tanh_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_tanh_tosa_MI_pipeline(self.Tanh(), (test_data,)) - - 
@parameterized.expand(test_data_suite) - @pytest.mark.tosa_ref_model - def test_tanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): - self._test_tanh_tosa_BI_pipeline(self.Tanh(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_tanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_tanh_tosa_u55_BI_pipeline(self.Tanh(), (test_data,)) - - @parameterized.expand(test_data_suite) - def test_tanh_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): - self._test_tanh_tosa_u85_BI_pipeline(self.Tanh(), (test_data,)) + "zeros": lambda: torch.zeros(10, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), +} + + +class Tanh(torch.nn.Module): + def __init__(self): + super().__init__() + self.tanh = torch.nn.Tanh() + + def forward(self, x): + return self.tanh(x) + + +@common.parametrize("test_data", test_data_suite) +def test_tanh_tosa_MI(test_data: Tuple): + pipeline = TosaPipelineMI[input_t1]( + Tanh(), + (test_data(),), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_tanh_tosa_BI(test_data: Tuple): + pipeline = TosaPipelineBI[input_t1]( + Tanh(), + (test_data(),), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_tanh_u55_BI(test_data: Tuple): + pipeline = EthosU55PipelineBI[input_t1]( + Tanh(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_tanh_u85_BI(test_data: Tuple): + pipeline = EthosU85PipelineBI[input_t1]( + Tanh(), + (test_data(),), + aten_op, + exir_ops=[], + run_on_fvp=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index db3e93fbdc9..9d873f30ce9 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,14 +7,14 @@ # Tests the _to_copy op which is interpreted as a cast for our purposes. # -import unittest +from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import TosaPipelineMI -from parameterized import parameterized +input_t1 = Tuple[torch.Tensor] # Input x class Cast(torch.nn.Module): @@ -27,41 +26,41 @@ def forward(self, x: torch.Tensor): return x.to(dtype=self.target_dtype) -class TestToCopy(unittest.TestCase): - """ - Tests the _to_copy operation. +""" +Tests the _to_copy operation. - Only test unquantized graphs as explicit casting of dtypes messes with the - quantization. +Only test unquantized graphs as explicit casting of dtypes messes with the +quantization. - Note: This is also covered by test_scalars.py. - """ +Note: This is also covered by test_scalars.py. 
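To make the cast concrete: at the PyTorch level the operation under test is a plain dtype conversion, which the file's own comment notes is interpreted as a cast and recorded as _to_copy on export. A minimal sketch (plain PyTorch; dtypes mirror the test data below):

    import torch

    x = torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8)
    y = x.to(dtype=torch.float32)  # the cast performed by the Cast module
    assert y.dtype == torch.float32 and y.shape == x.shape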
+""" + +_TO_COPY_TEST_DATA = { + "rand_fp16": lambda: (torch.rand((1, 2, 3, 4), dtype=torch.float16), torch.float32), + "rand_fp32": lambda: (torch.rand((1, 2, 3, 4), dtype=torch.float32), torch.float16), + "rand_int8": lambda: ( + torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), + torch.float32, + ), + "rand_int8_int32": lambda: ( + torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), + torch.int32, + ), + "rand_int32": lambda: ( + torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int32), + torch.int8, + ), +} - _TO_COPY_TEST_DATA = ( - (torch.rand((1, 2, 3, 4), dtype=torch.float16), torch.float32), - (torch.rand((1, 2, 3, 4), dtype=torch.float32), torch.float16), - (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.float32), - (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.int32), - (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int32), torch.int8), - ) - def _test_to_copy_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: torch.Tensor - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(_TO_COPY_TEST_DATA) - def test_view_tosa_MI(self, test_tensor: torch.Tensor, new_dtype): - self._test_to_copy_tosa_MI_pipeline(Cast(new_dtype), (test_tensor,)) +@common.parametrize("test_data", _TO_COPY_TEST_DATA) +def test_copy_tosa_MI(test_data: Tuple): + test_tensor, new_dtype = test_data() + + pipeline = TosaPipelineMI[input_t1]( + Cast(new_dtype), + (test_tensor,), + aten_op=[], + exir_op=[], + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_unary.py b/backends/arm/test/ops/test_unary.py index 1f91cab56c1..f8359bb4339 100644 --- a/backends/arm/test/ops/test_unary.py +++ b/backends/arm/test/ops/test_unary.py @@ -3,7 +3,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- from typing import Tuple import torch @@ -46,51 +45,51 @@ def forward(self, x: torch.Tensor): test_data = { - "ceil_zeros": ( + "ceil_zeros": lambda: ( Ceil(), zeros, ), - "floor_zeros": ( + "floor_zeros": lambda: ( Floor(), zeros, ), - "ceil_ones": ( + "ceil_ones": lambda: ( Ceil(), ones, ), - "floor_ones": ( + "floor_ones": lambda: ( Floor(), ones, ), - "ceil_rand": ( + "ceil_rand": lambda: ( Ceil(), rand, ), - "floor_rand": ( + "floor_rand": lambda: ( Floor(), rand, ), - "ceil_randn_pos": ( + "ceil_randn_pos": lambda: ( Ceil(), randn_pos, ), - "floor_randn_pos": ( + "floor_randn_pos": lambda: ( Floor(), randn_pos, ), - "ceil_randn_neg": ( + "ceil_randn_neg": lambda: ( Ceil(), randn_neg, ), - "floor_randn_neg": ( + "floor_randn_neg": lambda: ( Floor(), randn_neg, ), - "ceil_ramp": ( + "ceil_ramp": lambda: ( Ceil(), ramp, ), - "floor_ramp": ( + "floor_ramp": lambda: ( Floor(), ramp, ), @@ -99,55 +98,51 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", test_data) def test_unary_tosa_MI(test_data: input_t1): - module = test_data[0] + module, test_data = test_data() pipeline = TosaPipelineMI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op + module, + (test_data,), + module.aten_op, + module.exir_op, ) pipeline.run() @common.parametrize("test_data", test_data) def test_unary_tosa_BI(test_data: input_t1): - module = test_data[0] + module, test_data = test_data() pipeline = TosaPipelineBI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op + module, + (test_data,), + module.aten_op, + module.exir_op, ) pipeline.run() @common.parametrize("test_data", test_data) +@common.XfailIfNoCorstone300 def test_unary_u55_BI(test_data: input_t1): - module = test_data[0] + module, test_data = test_data() pipeline = EthosU55PipelineBI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=False + module, + (test_data,), + module.aten_op, + module.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_data", test_data) +@common.XfailIfNoCorstone320 def test_unary_u85_BI(test_data: input_t1): - module = test_data[0] - pipeline = EthosU85PipelineBI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=False - ) - pipeline.run() - - -@common.parametrize("test_data", test_data) -@common.SkipIfNoCorstone300 -def test_unary_u55_BI_on_fvp(test_data: input_t1): - module = test_data[0] - pipeline = EthosU55PipelineBI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=True - ) - pipeline.run() - - -@common.parametrize("test_data", test_data) -@common.SkipIfNoCorstone320 -def test_unary_u85_BI_on_fvp(test_data: input_t1): - module = test_data[0] + module, test_data = test_data() pipeline = EthosU85PipelineBI[input_t1]( - module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=True + module, + (test_data,), + module.aten_op, + module.exir_op, + run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 68f4fe46123..4ad238a099a 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
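For reference on the op under test: unsqueeze inserts a new size-1 dimension, and for a rank-n input every dim in [-n-1, n] is valid, which is exactly the range the MI test below iterates over. A minimal sketch (plain PyTorch):

    import torch

    x = torch.randn(5, 4)
    assert x.unsqueeze(0).shape == (1, 5, 4)   # new leading dim
    assert x.unsqueeze(-1).shape == (5, 4, 1)  # new trailing dim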
@@ -8,105 +7,76 @@ # Tests the unsqueeze op which copies the data of the input tensor (possibly with new data format) # -import unittest from typing import Sequence, Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester - -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestSimpleUnsqueeze(unittest.TestCase): - class Unsqueeze(torch.nn.Module): - shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3)] - test_parameters: list[tuple[torch.Tensor]] = [(torch.randn(n),) for n in shapes] - - def forward(self, x: torch.Tensor, dim): - return x.unsqueeze(dim) - - def _test_unsqueeze_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.unsqueeze.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) - def _test_unsqueeze_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.unsqueeze.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) +aten_op = "torch.ops.aten.unsqueeze.default" +input_t1 = Tuple[torch.Tensor, torch.scalar_tensor] # Input x, Input y - def _test_unsqueeze_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor, int], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.unsqueeze.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - @parameterized.expand(Unsqueeze.test_parameters) - def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): - for i in range(-test_tensor.dim() - 1, test_tensor.dim() + 1): - self._test_unsqueeze_tosa_MI_pipeline(self.Unsqueeze(), (test_tensor, i)) - - @parameterized.expand(Unsqueeze.test_parameters) - def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): - self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) - - @parameterized.expand(Unsqueeze.test_parameters) - def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): - self._test_unsqueeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(), - self.Unsqueeze(), - (test_tensor, 0), - ) +class Unsqueeze(torch.nn.Module): + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3)] + test_parameters = {} + for n in shapes: + test_parameters[f"rand_{n}"] = (torch.randn(n),) + + def forward(self, x: torch.Tensor, dim): + return x.unsqueeze(dim) + - @parameterized.expand(Unsqueeze.test_parameters) - def test_unsqueeze_u85_BI(self, test_tensor: torch.Tensor): - self._test_unsqueeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(), - 
self.Unsqueeze(), - (test_tensor, 0), +@common.parametrize("test_tensor", Unsqueeze.test_parameters) +def test_unsqueeze_tosa_MI(test_tensor: torch.Tensor): + for i in range(-test_tensor[0].dim() - 1, test_tensor[0].dim() + 1): + pipeline = TosaPipelineMI[input_t1]( + Unsqueeze(), + (*test_tensor, i), + aten_op, + exir_op=[], ) + pipeline.run() + + +@common.parametrize("test_tensor", Unsqueeze.test_parameters) +def test_unsqueeze_tosa_BI(test_tensor: torch.Tensor): + pipeline = TosaPipelineBI[input_t1]( + Unsqueeze(), + (*test_tensor, 0), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_tensor", Unsqueeze.test_parameters) +@common.XfailIfNoCorstone300 +def test_unsqueeze_u55_BI(test_tensor: torch.Tensor): + pipeline = EthosU55PipelineBI[input_t1]( + Unsqueeze(), + (*test_tensor, 0), + aten_op, + exir_ops=[], + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_tensor", Unsqueeze.test_parameters) +@common.XfailIfNoCorstone320 +def test_unsqueeze_u85_BI(test_tensor: torch.Tensor): + pipeline = EthosU85PipelineBI[input_t1]( + Unsqueeze(), + (*test_tensor, 0), + aten_op, + exir_ops=[], + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_upsample_nearest2d.py b/backends/arm/test/ops/test_upsample_nearest2d.py index 8984d716a3d..7809d5fdee2 100644 --- a/backends/arm/test/ops/test_upsample_nearest2d.py +++ b/backends/arm/test/ops/test_upsample_nearest2d.py @@ -1,165 +1,163 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest - from typing import Optional, Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.upsample_nearest2d.vec" +input_t1 = Tuple[torch.Tensor] # Input x -test_data_suite = [ +test_data_suite = { # (test_name, test_data, size, scale_factor, compare_outputs) - ("rand_double_scale", torch.rand(2, 4, 8, 3), None, 2.0, True), - ("rand_double_scale_one_dim", torch.rand(2, 4, 8, 3), None, (1.0, 2.0), True), - ("rand_double_size", torch.rand(2, 4, 8, 3), (16, 6), None, True), - ("rand_one_double_scale", torch.rand(2, 4, 1, 1), None, 2.0, True), - ("rand_one_double_size", torch.rand(2, 4, 1, 1), (2, 2), None, True), - ("rand_one_same_scale", torch.rand(2, 4, 1, 1), None, 1.0, True), - ("rand_one_same_size", torch.rand(2, 4, 1, 1), (1, 1), None, True), + "rand_double_scale": lambda: (torch.rand(2, 4, 8, 3), None, 2.0, True), + "rand_double_scale_one_dim": lambda: ( + torch.rand(2, 4, 8, 3), + None, + (1.0, 2.0), + True, + ), + "rand_double_size": lambda: (torch.rand(2, 4, 8, 3), (16, 6), None, True), + "rand_one_double_scale": lambda: (torch.rand(2, 4, 1, 1), None, 2.0, True), + "rand_one_double_size": lambda: (torch.rand(2, 4, 1, 1), (2, 2), None, True), + "rand_one_same_scale": lambda: (torch.rand(2, 4, 1, 1), None, 1.0, True), + "rand_one_same_size": lambda: (torch.rand(2, 4, 1, 1), (1, 1), None, True), # Can't compare outputs as the rounding when selecting the nearest pixel is # different between PyTorch and TOSA. Just check the legalization went well. 
# TODO Improve the test infrastructure to support more in depth verification # of the TOSA legalization results. - ("rand_half_scale", torch.rand(2, 4, 8, 6), None, 0.5, False), - ("rand_half_size", torch.rand(2, 4, 8, 6), (4, 3), None, False), - ("rand_one_and_half_scale", torch.rand(2, 4, 8, 3), None, 1.5, False), - ("rand_one_and_half_size", torch.rand(2, 4, 8, 3), (12, 4), None, False), -] - - -class TestUpsampleNearest2d(unittest.TestCase): - class UpsamplingNearest2d(torch.nn.Module): - def __init__( - self, - size: Optional[Tuple[int]], - scale_factor: Optional[float | Tuple[float]], - ): - super().__init__() - self.upsample = torch.nn.UpsamplingNearest2d( # noqa: TOR101 - size=size, scale_factor=scale_factor - ) - - def forward(self, x): - return self.upsample(x) - - class Upsample(torch.nn.Module): - def __init__( - self, - size: Optional[Tuple[int]], - scale_factor: Optional[float | Tuple[float]], - ): - super().__init__() - self.upsample = torch.nn.Upsample( - size=size, scale_factor=scale_factor, mode="nearest" - ) - - def forward(self, x): - return self.upsample(x) - - class Interpolate(torch.nn.Module): - def __init__( - self, - size: Optional[Tuple[int]], - scale_factor: Optional[float | Tuple[float]], - ): - super().__init__() - self.upsample = lambda x: torch.nn.functional.interpolate( - x, size=size, scale_factor=scale_factor, mode="nearest" - ) - - def forward(self, x): - return self.upsample(x) - - def _test_upsample_nearest_2d_tosa_MI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - compare_outputs: bool, - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.upsample_nearest2d.vec"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["torch.ops.aten.upsample_nearest2d.vec"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) + "rand_half_scale": lambda: (torch.rand(2, 4, 8, 6), None, 0.5, False), + "rand_half_size": lambda: (torch.rand(2, 4, 8, 6), (4, 3), None, False), + "rand_one_and_half_scale": lambda: (torch.rand(2, 4, 8, 3), None, 1.5, False), + "rand_one_and_half_size": lambda: (torch.rand(2, 4, 8, 3), (12, 4), None, False), +} - if compare_outputs: - tester.run_method_and_compare_outputs(inputs=test_data) - def _test_upsample_nearest_2d_tosa_BI_pipeline( +class UpsamplingNearest2d(torch.nn.Module): + def __init__( self, - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - compare_outputs: bool, + size: Optional[Tuple[int]], + scale_factor: Optional[float | Tuple[float]], ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.aten.upsample_nearest2d.vec"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_not(["torch.ops.aten.upsample_nearest2d.vec"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() + super().__init__() + self.upsample = torch.nn.UpsamplingNearest2d( # noqa: TOR101 + size=size, scale_factor=scale_factor ) - if compare_outputs: - tester.run_method_and_compare_outputs(inputs=test_data) + def forward(self, x): + return self.upsample(x) + - @parameterized.expand(test_data_suite) - def test_upsample_nearest_2d_tosa_MI( +class Upsample(torch.nn.Module): + def __init__( self, - test_name: str, - test_data: 
torch.Tensor, size: Optional[Tuple[int]], scale_factor: Optional[float | Tuple[float]], - compare_outputs: bool, ): - self._test_upsample_nearest_2d_tosa_MI_pipeline( - self.UpsamplingNearest2d(size, scale_factor), (test_data,), compare_outputs - ) - self._test_upsample_nearest_2d_tosa_MI_pipeline( - self.Upsample(size, scale_factor), (test_data,), compare_outputs - ) - self._test_upsample_nearest_2d_tosa_MI_pipeline( - self.Interpolate(size, scale_factor), (test_data,), compare_outputs + super().__init__() + self.upsample = torch.nn.Upsample( + size=size, scale_factor=scale_factor, mode="nearest" ) - @parameterized.expand(test_data_suite) - def test_upsample_nearest_2d_tosa_BI( + def forward(self, x): + return self.upsample(x) + + +class Interpolate(torch.nn.Module): + def __init__( self, - test_name: str, - test_data: torch.Tensor, size: Optional[Tuple[int]], scale_factor: Optional[float | Tuple[float]], - compare_outputs: bool, ): - self._test_upsample_nearest_2d_tosa_BI_pipeline( - self.UpsamplingNearest2d(size, scale_factor), (test_data,), compare_outputs - ) - self._test_upsample_nearest_2d_tosa_BI_pipeline( - self.Upsample(size, scale_factor), (test_data,), compare_outputs - ) - self._test_upsample_nearest_2d_tosa_BI_pipeline( - self.Interpolate(size, scale_factor), (test_data,), compare_outputs + super().__init__() + self.upsample = lambda x: torch.nn.functional.interpolate( + x, size=size, scale_factor=scale_factor, mode="nearest" ) + + def forward(self, x): + return self.upsample(x) + + +@common.parametrize("test_data", test_data_suite) +def test_upsample_nearest2d_vec_tosa_MI(test_data: torch.Tensor): + test_data, size, scale_factor, compare_outputs = test_data() + + pipeline = TosaPipelineMI[input_t1]( + UpsamplingNearest2d(size, scale_factor), + (test_data,), + aten_op, + exir_op=[], + ) + if not compare_outputs: + pipeline.pop_stage(-1) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_upsample_nearest2d_vec_tosa_MI_nearest(test_data: torch.Tensor): + test_data, size, scale_factor, compare_outputs = test_data() + + pipeline = TosaPipelineMI[input_t1]( + Upsample(size, scale_factor), + (test_data,), + aten_op, + exir_op=[], + ) + if not compare_outputs: + pipeline.pop_stage(-1) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_upsample_nearest2d_vec_tosa_MI_interpolate(test_data: torch.Tensor): + test_data, size, scale_factor, compare_outputs = test_data() + + pipeline = TosaPipelineMI[input_t1]( + Interpolate(size, scale_factor), + (test_data,), + aten_op, + exir_op=[], + ) + if not compare_outputs: + pipeline.pop_stage(-1) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_upsample_nearest2d_vec_tosa_BI_interpolate(test_data: torch.Tensor): + test_data, size, scale_factor, compare_outputs = test_data() + + pipeline = TosaPipelineBI[input_t1]( + UpsamplingNearest2d(size, scale_factor), + (test_data,), + aten_op, + exir_op=[], + ) + if not compare_outputs: + pipeline.pop_stage(-1) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_upsample_nearest2d_vec_tosa_BI_nearest(test_data: torch.Tensor): + test_data, size, scale_factor, compare_outputs = test_data() + + pipeline = TosaPipelineBI[input_t1]( + Upsample(size, scale_factor), + (test_data,), + aten_op, + exir_op=[], + ) + if not compare_outputs: + pipeline.pop_stage(-1) + + pipeline.run() diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 
fb23f24307e..63681263fab 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -1,255 +1,321 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# -# Tests the mean op which changes the size of a Tensor without changing the underlying data. -# -import unittest +from typing import Tuple import torch -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, ) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester - -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.backend_details import CompileSpec - -from parameterized import parameterized - - -class TestVar(unittest.TestCase): - - class Var(torch.nn.Module): - test_parameters = [ - (torch.randn(1, 50, 10, 20), True, 0), - (torch.rand(1, 50, 10), False, 0), - (torch.randn(1, 30, 15, 20), True, 1), - (torch.rand(1, 50, 10, 20), False, 0.5), - ] - - def __init__(self, keepdim: bool = True, correction: int = 0): - super().__init__() - self.keepdim = keepdim - self.correction = correction - - def forward( - self, - x: torch.Tensor, - ): - return x.var(keepdim=self.keepdim, correction=self.correction) - - class VarDim(torch.nn.Module): - test_parameters = [ - (torch.randn(1, 50, 10, 20), 1, True, False), - (torch.rand(1, 50, 10), -2, False, False), - (torch.randn(1, 30, 15, 20), -3, True, True), - (torch.rand(1, 50, 10, 20), -1, False, True), - ] - - test_parameters_u55 = [ - (torch.randn(1, 50, 10, 20), 1, True, False), - (torch.randn(1, 30, 15, 20), -3, True, True), - ] - - test_parameters_u55_xfails = [ - (torch.rand(1, 50, 10), -2, True, False), - (torch.rand(1, 50, 10, 20), -1, True, True), - ] - - def __init__(self, dim: int = -1, keepdim: bool = True, unbiased: bool = False): - super().__init__() - self.dim = dim - self.keepdim = keepdim - self.unbiased = unbiased - - def forward( - self, - x: torch.Tensor, - ): - return x.var(dim=self.dim, keepdim=self.keepdim, unbiased=self.unbiased) - - class VarCorrection(torch.nn.Module): - test_parameters = [ - (torch.randn(1, 50, 10, 20), (-1, -2), True, 0), - (torch.rand(1, 50, 10), (-2), True, 0), - (torch.randn(1, 30, 15, 20), (-1, -2, -3), True, 1), - (torch.rand(1, 50, 10, 20), (-1, -2), True, 0.5), - ] - - def __init__( - self, dim: int = -1, keepdim: bool = True, correction: bool = False - ): - super().__init__() - self.dim = dim - self.keepdim = keepdim - self.correction = correction - - def forward( - self, - x: torch.Tensor, - ): - return x.var(dim=self.dim, keepdim=self.keepdim, correction=self.correction) - - def _test_var_tosa_MI_pipeline( + +input_t1 = Tuple[torch.Tensor] # Input x + + +class Var(torch.nn.Module): + test_parameters = { + "var_4d_keep_dim_0_correction": lambda: (torch.randn(1, 50, 10, 20), True, 0), + "var_3d_no_keep_dim_0_correction": lambda: (torch.rand(1, 50, 10), False, 0), + "var_4d_keep_dim_1_correction": lambda: (torch.randn(1, 30, 15, 20), True, 1), + "var_4d_no_keep_dim_0_5_correction": lambda: ( + torch.rand(1, 
50, 10, 20), + False, + 0.5, + ), + } + + def __init__(self, keepdim: bool = True, correction: int = 0): + super().__init__() + self.keepdim = keepdim + self.correction = correction + + def forward( self, - module: torch.nn.Module, - test_data: torch.Tensor, - target_str: str = None, + x: torch.Tensor, ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_var_tosa_BI_pipeline( + return x.var(keepdim=self.keepdim, correction=self.correction) + + +class VarDim(torch.nn.Module): + test_parameters = { + "var_4d_dim_1_keep_dim_unbiased": lambda: ( + torch.randn(1, 50, 10, 20), + 1, + True, + False, + ), + "var_3d_dim_neg_2_no_keep_dim_unbiased": lambda: ( + torch.rand(1, 50, 10), + -2, + False, + False, + ), + "var_3d_dim_neg_3_keep_dim_biased": lambda: ( + torch.randn(1, 30, 15, 20), + -3, + True, + True, + ), + "var_3d_dim_neg_1_no_keep_dim_biased": lambda: ( + torch.rand(1, 50, 10, 20), + -1, + False, + True, + ), + } + + test_parameters_u55 = { + "var_4d_dim_1_keep_dim_unbiased": lambda: ( + torch.randn(1, 50, 10, 20), + 1, + True, + False, + ), + "var_4d_dim_neg_3_keep_dim_biased": lambda: ( + torch.randn(1, 30, 15, 20), + -3, + True, + True, + ), + } + + test_parameters_u55_xfails = { + "var_3d_dim_neg_2_keep_dim_unbiased": lambda: ( + torch.rand(1, 50, 10), + -2, + True, + False, + ), + "var_3d_dim_neg_1_keep_dim_biased": lambda: ( + torch.rand(1, 50, 10, 20), + -1, + True, + True, + ), + } + + def __init__(self, dim: int = -1, keepdim: bool = True, unbiased: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.unbiased = unbiased + + def forward( self, - module: torch.nn.Module, - test_data: torch.Tensor, - target_str: str = None, + x: torch.Tensor, ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_var_ethosu_BI_pipeline( + return x.var(dim=self.dim, keepdim=self.keepdim, unbiased=self.unbiased) + + +class VarCorrection(torch.nn.Module): + test_parameters = { + "var_4d_dims_keep_dim_0_correction": lambda: ( + torch.randn(1, 50, 10, 20), + (-1, -2), + True, + 0, + ), + "var_3d_dims_keep_dim_0_correction": lambda: ( + torch.rand(1, 50, 10), + (-2), + True, + 0, + ), + "var_4d_dims_keep_dim_1_correction": lambda: ( + torch.randn(1, 30, 15, 20), + (-1, -2, -3), + True, + 1, + ), + "var_4d_dims_keep_dim_0_5_correction": lambda: ( + torch.rand(1, 50, 10, 20), + (-1, -2), + True, + 0.5, + ), + } + + def __init__(self, dim: int = -1, keepdim: bool = True, correction: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.correction = correction + + def forward( self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: torch.Tensor, - target_str: str = None, - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - 
get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) - - @parameterized.expand(Var.test_parameters) - def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_MI_pipeline(self.Var(keepdim, correction), (test_tensor,)) - - @parameterized.expand(Var.test_parameters) - def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_BI_pipeline(self.Var(keepdim, correction), (test_tensor,)) - - @parameterized.expand(Var.test_parameters) - def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_ethosu_BI_pipeline( - self.Var(keepdim, correction), - common.get_u55_compile_spec(), - (test_tensor,), - ) - - @parameterized.expand(Var.test_parameters) - def test_var_u85_BI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_ethosu_BI_pipeline( - self.Var(keepdim, correction), - common.get_u85_compile_spec(), - (test_tensor,), - ) - - @parameterized.expand(VarDim.test_parameters) - def test_var_dim_tosa_MI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): - self._test_var_tosa_MI_pipeline( - self.VarDim(dim, keepdim, unbiased), (test_tensor,) - ) - - @parameterized.expand(VarDim.test_parameters) - def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): - self._test_var_tosa_BI_pipeline( - self.VarDim(dim, keepdim, unbiased), (test_tensor,) - ) - - @parameterized.expand(VarDim.test_parameters_u55) - def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): - self._test_var_ethosu_BI_pipeline( - self.VarDim(dim, keepdim, unbiased), - common.get_u55_compile_spec(), - (test_tensor,), - ) - - @parameterized.expand(VarDim.test_parameters) - def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): - self._test_var_ethosu_BI_pipeline( - self.VarDim(dim, keepdim, unbiased), - common.get_u85_compile_spec(), - (test_tensor,), - ) - - @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_tosa_MI( - self, test_tensor: torch.Tensor, dim, keepdim, correction - ): - self._test_var_tosa_MI_pipeline( - self.VarCorrection(dim, keepdim, correction), (test_tensor,) - ) - - @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_tosa_BI( - self, test_tensor: torch.Tensor, dim, keepdim, correction - ): - self._test_var_tosa_BI_pipeline( - self.VarCorrection(dim, keepdim, correction), (test_tensor,) - ) - - @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_u55_BI( - self, test_tensor: torch.Tensor, dim, keepdim, correction - ): - self._test_var_ethosu_BI_pipeline( - self.VarCorrection(dim, keepdim, correction), - common.get_u55_compile_spec(), - (test_tensor,), - ) - - @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_u85_BI( - self, test_tensor: torch.Tensor, dim, keepdim, correction + x: torch.Tensor, ): - self._test_var_ethosu_BI_pipeline( - self.VarCorrection(dim, keepdim, correction), - common.get_u85_compile_spec(), - (test_tensor,), - ) + return x.var(dim=self.dim, keepdim=self.keepdim, 
correction=self.correction) + + +@common.parametrize("test_data", Var.test_parameters) +def test_var_dim_tosa_MI_no_dim(test_data: Tuple): + test_data, keepdim, correction = test_data() + pipeline = TosaPipelineMI[input_t1]( + Var(keepdim, correction), + (test_data,), + aten_op=[], + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", Var.test_parameters) +def test_var_dim_tosa_BI_no_dim(test_data: Tuple): + test_data, keepdim, correction = test_data() + pipeline = TosaPipelineBI[input_t1]( + Var(keepdim, correction), + (test_data,), + aten_op=[], + exir_op=[], + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Var.test_parameters) +@common.XfailIfNoCorstone300 +def test_var_dim_u55_BI_no_dim(test_data: Tuple): + test_data, keepdim, correction = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + Var(keepdim, correction), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", Var.test_parameters) +@common.XfailIfNoCorstone320 +def test_var_dim_u85_BI_no_dim(test_data: Tuple): + test_data, keepdim, correction = test_data() + pipeline = EthosU85PipelineBI[input_t1]( + Var(keepdim, correction), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarDim.test_parameters) +def test_var_dim_tosa_MI(test_data: Tuple): + test_data, dim, keepdim, unbiased = test_data() + pipeline = TosaPipelineMI[input_t1]( + VarDim(dim, keepdim, unbiased), + (test_data,), + aten_op=[], + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", VarDim.test_parameters) +def test_var_dim_tosa_BI(test_data: Tuple): + + test_data, dim, keepdim, unbiased = test_data() + pipeline = TosaPipelineBI[input_t1]( + VarDim(dim, keepdim, unbiased), + (test_data,), + aten_op=[], + exir_op=[], + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarDim.test_parameters_u55) +@common.XfailIfNoCorstone300 +def test_var_dim_u55_BI(test_data: Tuple): + test_data, dim, keepdim, unbiased = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + VarDim(dim, keepdim, unbiased), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarDim.test_parameters) +@common.XfailIfNoCorstone320 +def test_var_dim_u85_BI(test_data: Tuple): + test_data, dim, keepdim, unbiased = test_data() + pipeline = EthosU85PipelineBI[input_t1]( + VarDim(dim, keepdim, unbiased), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarCorrection.test_parameters) +def test_var_dim_tosa_MI_correction(test_data: Tuple): + test_data, dim, keepdim, correction = test_data() + pipeline = TosaPipelineMI[input_t1]( + VarCorrection(dim, keepdim, correction), + (test_data,), + aten_op=[], + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", VarCorrection.test_parameters) +def test_var_dim_tosa_BI_correction(test_data: Tuple): + test_data, dim, keepdim, correction = test_data() + pipeline = TosaPipelineBI[input_t1]( + VarCorrection(dim, keepdim, correction), + (test_data,), + aten_op=[], + exir_op=[], + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarCorrection.test_parameters) 
+@common.XfailIfNoCorstone300 +def test_var_dim_u55_BI_correction(test_data: Tuple): + test_data, dim, keepdim, correction = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + VarCorrection(dim, keepdim, correction), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", VarCorrection.test_parameters) +@common.XfailIfNoCorstone320 +def test_var_dim_u85_BI_correction(test_data: Tuple): + test_data, dim, keepdim, correction = test_data() + pipeline = EthosU85PipelineBI[input_t1]( + VarCorrection(dim, keepdim, correction), + (test_data,), + aten_ops=[], + exir_ops=[], + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index f90ae402067..a899be6750d 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,131 +7,90 @@ # Tests the view op which changes the size of a Tensor without changing the underlying data. # -import unittest from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester - -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestView(unittest.TestCase): - """Tests the view operation.""" - - class View(torch.nn.Module): - - needs_transpose_tests = [ - (torch.rand(100), (1, -1, 5, 2)), - (torch.rand(10, 2, 1, 5), (1, -1, 5, 2)), - (torch.rand(1, 2, 1, 9), (3, 1, 3, 2)), - (torch.rand(2, 1, 1, 9), (3, 2, 3, 1)), - (torch.rand(2, 50, 2, 1), (1, 200)), - (torch.rand(2, 5, 2, 3), (1, 15, 4)), - ] - - no_transpose_tests = [ - (torch.rand(2, 1, 1, 9), (3, 1, 3, 2)), - (torch.rand(5, 10, 1, 1), (25, 2, 1, 1)), - (torch.rand(10, 2), (1, 1, 5, 4)), - (torch.rand(10, 10), (5, 1, 5, 4)), - (torch.rand(1, 1, 1, 10), (1, 1, 10, 1)), - (torch.rand(1, 1, 5, 10), (1, 1, 50, 1)), - (torch.rand(5, 10, 1, 1), (1, 25, 2)), - (torch.rand(2, 50, 1, 1), (1, 100)), - (torch.rand(2, 3, 2, 3), (2, 3, 3, 2)), - ] - - def forward(self, x: torch.Tensor, new_shape): - return x.view(new_shape) - - def _test_view_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: torch.Tensor - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.view.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_view_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.view.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_view_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - ( - 
ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.view.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_view_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - self._test_view_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_view_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - self._test_view_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) - def test_view_tosa_MI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_tosa_MI_pipeline(self.View(), (test_tensor, new_shape)) - - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) - def test_view_tosa_BI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_tosa_BI_pipeline(self.View(), (test_tensor, new_shape)) - - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) - def test_view_u55_BI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_u55_BI_pipeline(self.View(), (test_tensor, new_shape)) - - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) - def test_view_u85_BI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_u85_BI_pipeline(self.View(), (test_tensor, new_shape)) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.view.default" + +input_t1 = Tuple[torch.Tensor, torch.Tensor] # Input x, Input y + + +class View(torch.nn.Module): + + needs_transpose_tests = { + "rand_1d_neg": lambda: (torch.rand(100), (1, -1, 5, 2)), + "rand_4d_neg": lambda: (torch.rand(10, 2, 1, 5), (1, -1, 5, 2)), + "rand_4d_4d_small": lambda: (torch.rand(1, 2, 1, 9), (3, 1, 3, 2)), + "rand_4d_4d": lambda: (torch.rand(2, 1, 1, 9), (3, 2, 3, 1)), + "rand_4d_2d": lambda: (torch.rand(2, 50, 2, 1), (1, 200)), + "rand_4d_3d": lambda: (torch.rand(2, 5, 2, 3), (1, 15, 4)), + "rand_4d_1": lambda: (torch.rand(2, 1, 1, 9), (3, 1, 3, 2)), + "rand_4d_2": lambda: (torch.rand(5, 10, 1, 1), (25, 2, 1, 1)), + "rand_4d_2_4": lambda: (torch.rand(10, 2), (1, 1, 5, 4)), + "rand_4d_2_4_big": lambda: (torch.rand(10, 10), (5, 1, 5, 4)), + "rand_4d_4_4": lambda: (torch.rand(1, 1, 1, 10), (1, 1, 10, 1)), + "rand_4d_4_4_big": lambda: (torch.rand(1, 1, 5, 10), (1, 1, 50, 1)), + "rand_4d_4_3": lambda: (torch.rand(5, 10, 1, 1), (1, 25, 2)), + "rand_4d_4_2": lambda: (torch.rand(2, 50, 1, 1), (1, 100)), + "rand_4d_2_4_same": lambda: (torch.rand(2, 3, 2, 3), (2, 3, 3, 2)), + } + + def forward(self, x: torch.Tensor, new_shape): + return x.view(new_shape) + + +@common.parametrize("test_data", View.needs_transpose_tests) +def test_view_tosa_MI(test_data: Tuple): + test_tensor, new_shape = test_data() + pipeline = TosaPipelineMI[input_t1]( + View(), + (test_tensor, new_shape), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", View.needs_transpose_tests) +def test_view_tosa_BI(test_data: Tuple): + test_tensor, new_shape = test_data() + pipeline = TosaPipelineBI[input_t1]( + View(), + (test_tensor, new_shape), + aten_op, + exir_op=[], + ) + pipeline.run() + + +@common.parametrize("test_data", View.needs_transpose_tests) +def 
test_view_u55_BI(test_data: Tuple): + test_tensor, new_shape = test_data() + pipeline = EthosU55PipelineBI[input_t1]( + View(), + (test_tensor, new_shape), + aten_op, + exir_ops=[], + ) + pipeline.run() + + +@common.parametrize("test_data", View.needs_transpose_tests) +def test_view_u85_BI(test_data: Tuple): + test_tensor, new_shape = test_data() + pipeline = EthosU85PipelineBI[input_t1]( + View(), + (test_tensor, new_shape), + aten_op, + exir_ops=[], + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_where.py b/backends/arm/test/ops/test_where.py index 91d616232fa..7bfd27ac0a8 100644 --- a/backends/arm/test/ops/test_where.py +++ b/backends/arm/test/ops/test_where.py @@ -5,15 +5,13 @@ from typing import List, Tuple -import pytest - import torch from executorch.backends.arm.quantizer import ( EthosUQuantizer, get_symmetric_quantization_config, - TOSAQuantizer, ) + from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( EthosU85PipelineBI, @@ -124,18 +122,18 @@ def scalar_condition(input: torch.Tensor): ) test_modules_common = { - "two_dim_tensor_cond": two_dim_tensor_cond, - "three_dim_tensor_cond": three_dim_tensor_cond, - "float32_tensor_cond": float32_tensor_cond, - "two_dim_scalar_cond": two_dim_scalar_cond, - "three_dim_scalar_cond": three_dim_scalar_cond, - "float32_scalar_cond": float32_scalar_cond, + "two_dim_tensor_cond": lambda: two_dim_tensor_cond, + "three_dim_tensor_cond": lambda: three_dim_tensor_cond, + "float32_tensor_cond": lambda: float32_tensor_cond, + "two_dim_scalar_cond": lambda: two_dim_scalar_cond, + "three_dim_scalar_cond": lambda: three_dim_scalar_cond, + "float32_scalar_cond": lambda: float32_scalar_cond, } test_modules_MI = { **test_modules_common, - "float32_tensor_cond_tuple_dtype": float32_tensor_cond_tuple_dtype, - "float32_tensor_cond_tuple_dtype_bool": float32_tensor_cond_tuple_dtype_bool, + "float32_tensor_cond_tuple_dtype": lambda: float32_tensor_cond_tuple_dtype, + "float32_tensor_cond_tuple_dtype_bool": lambda: float32_tensor_cond_tuple_dtype_bool, } test_modules_BI = { @@ -146,93 +144,51 @@ def scalar_condition(input: torch.Tensor): @common.parametrize("test_module", test_modules_MI) -def test_where_tosa_MI(test_module): +def test_where_self_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, ) pipeline.run() @common.parametrize("test_module", test_modules_BI) -def test_where_tosa_BI(test_module): - compile_spec = common.get_tosa_compile_spec("TOSA-0.80+BI") - quantizer = TOSAQuantizer(compile_spec).set_io(get_symmetric_quantization_config()) +def test_where_self_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op - ) - pipeline.change_args( - "quantize", Quantize(quantizer, get_symmetric_quantization_config()) + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + symmetric_io_quantization=True, ) pipeline.run() @common.parametrize("test_module", test_modules_BI) -def test_where_u55_BI(test_module): - compile_spec = common.get_u55_compile_spec() - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - +@common.XfailIfNoCorstone300 +def test_where_self_u55_BI_not_delegated(test_module): # There will be one full_like op which will be delegated. 
num_delegates = 1 num_exir = 0 - pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", - { - exir_op: 1, - "executorch_exir_dialects_edge__ops_aten_full_default": num_exir, - }, - num_delegates, - ) - - pipeline.change_args( - "quantize", Quantize(quantizer, get_symmetric_quantization_config()) - ) - pipeline.run() - - -@common.parametrize("test_module", test_modules_BI) -def test_where_u85_BI(test_module): - compile_spec = common.get_u85_compile_spec() - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - pipeline = EthosU85PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False - ) - pipeline.change_args( - "quantize", Quantize(quantizer, get_symmetric_quantization_config()) - ) - pipeline.run() - - -@common.parametrize("test_module", test_modules_BI) -@pytest.mark.skip(reason="The same as test_where_u55_BI") -@common.XfailIfNoCorstone300 -def test_where_u55_BI_on_fvp(test_module): compile_spec = common.get_u55_compile_spec() quantizer = EthosUQuantizer(compile_spec).set_io( get_symmetric_quantization_config() ) - # There will be one full_like op which will be delegated. - num_delegates = 1 - num_exir = 0 - pipeline = OpNotSupportedPipeline[input_t]( - test_module, - test_module.get_inputs(), - "TOSA-0.80+BI+u55", + test_module(), + test_module().get_inputs(), { exir_op: 1, "executorch_exir_dialects_edge__ops_aten_full_default": num_exir, }, num_delegates, + quantize=True, + u55_subset=True, ) - pipeline.change_args( "quantize", Quantize(quantizer, get_symmetric_quantization_config()) ) @@ -241,15 +197,14 @@ def test_where_u55_BI_on_fvp(test_module): @common.parametrize("test_module", test_modules_BI) @common.XfailIfNoCorstone320 -def test_where_u85_BI_on_fvp(test_module): - compile_spec = common.get_u85_compile_spec() - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) +def test_where_self_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True - ) - pipeline.change_args( - "quantize", Quantize(quantizer, get_symmetric_quantization_config()) + test_module(), + test_module().get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py index 38d82b739e1..c4c90064bce 100644 --- a/backends/arm/test/tester/test_pipeline.py +++ b/backends/arm/test/tester/test_pipeline.py @@ -13,8 +13,9 @@ get_symmetric_quantization_config, TOSAQuantizer, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester, RunPasses +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -281,8 +282,14 @@ def __init__( rtol: float = 1e-03, qtol: int = 0, ): + tosa_profiles = { + "0.80": TosaSpecification.create_from_string("TOSA-0.80+BI"), + "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT"), + } + tosa_version = conftest.get_option("tosa_version") + compile_spec = common.get_tosa_compile_spec( - tosa_version, custom_path=custom_path + tosa_profiles[tosa_version], custom_path=custom_path ) quant_stage = ( Quantize( @@ -371,8 +378,14 @@ def 
__init__( rtol: float = 1e-03, qtol: int = 0, ): + tosa_profiles = { + "0.80": TosaSpecification.create_from_string("TOSA-0.80+MI"), + "1.0": TosaSpecification.create_from_string("TOSA-1.0+FP"), + } + tosa_version = conftest.get_option("tosa_version") + compile_spec = common.get_tosa_compile_spec( - tosa_version, custom_path=custom_path + tosa_profiles[tosa_version], custom_path=custom_path ) super().__init__( module, @@ -663,7 +676,6 @@ class TransformAnnotationPassPipeline(BasePipelineMaker, Generic[T]): Attributes: module: The module which the pipeline is applied to. test_data: Data used for testing the module. - tosa_version: The TOSA-version which to test for. custom_path : Path to dump intermediate artifacts such as tosa and pte to. @@ -673,11 +685,16 @@ def __init__( self, module: torch.nn.Module, test_data: T, - tosa_version: str, custom_path: str = None, ): + tosa_profiles = { + "0.80": TosaSpecification.create_from_string("TOSA-0.80+BI"), + "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT"), + } + tosa_version = conftest.get_option("tosa_version") + compile_spec = common.get_tosa_compile_spec( - tosa_version, custom_path=custom_path + tosa_profiles[tosa_version], custom_path=custom_path ) super().__init__( module, @@ -723,11 +740,21 @@ def __init__( self, module: torch.nn.Module, test_data: T, - tosa_version: str, non_delegated_ops: Dict[str, int], n_expected_delegates: int = 0, custom_path: str = None, + quantize: Optional[bool] = False, + u55_subset: Optional[bool] = False, ): + tosa_profiles = { + "0.80": "TOSA-0.80+" + ("BI" if quantize else "MI"), + "1.0": "TOSA-1.0+" + ("INT" if quantize else "FP"), + } + tosa_version = tosa_profiles[conftest.get_option("tosa_version")] + + if u55_subset and quantize: + tosa_version = f"{tosa_version}+u55" + compile_spec = common.get_tosa_compile_spec( tosa_version, custom_path=custom_path ) @@ -739,7 +766,7 @@ def __init__( [], ) - if "BI" in tosa_version: + if "INT" in tosa_version or "BI" in tosa_version: self.add_stage(self.tester.quantize, pos=0) self.change_args("check_not.exir", []) From 6346bfd3b6b7875e082a3886d0f301a694e3b2b1 Mon Sep 17 00:00:00 2001 From: jathu Date: Thu, 8 May 2025 10:14:34 -0700 Subject: [PATCH 009/178] Automatically announce declared options (#10766) ### Summary Instead of manually printing all the options in `tools/cmake/Utils.cmake`, let's just "automatically" print all the configured options. ### Test plan ``` $ ./scripts/build_apple_frameworks.sh --Debug -- --- Configurated Options --- -- EXECUTORCH_ENABLE_LOGGING : ON -- --------------------------- ``` ``` $ ./scripts/build_apple_frameworks.sh --Release -- --- Configurated Options --- -- EXECUTORCH_ENABLE_LOGGING : OFF -- --------------------------- ``` cc @larryliu0820 --- CMakeLists.txt | 3 ++ tools/cmake/Utils.cmake | 4 --- tools/cmake/common/preset.cmake | 50 +++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45993721a66..2d3f8e5f907 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,9 @@ project(executorch) include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake) include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake) +# Print all the configs that were called with announce_configured_options. 
+print_configured_options() + # MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ---------------------------------------------------- include(tools/cmake/Utils.cmake) diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake index 0a09f51fd28..3155c3fc16e 100644 --- a/tools/cmake/Utils.cmake +++ b/tools/cmake/Utils.cmake @@ -30,10 +30,6 @@ function(executorch_print_configuration_summary) message(STATUS " BUCK2 : ${BUCK2}") message(STATUS " PYTHON_EXECUTABLE : ${PYTHON_EXECUTABLE}") message(STATUS " FLATC_EXECUTABLE : ${FLATC_EXECUTABLE}") - message( - STATUS - " EXECUTORCH_ENABLE_LOGGING : ${EXECUTORCH_ENABLE_LOGGING}" - ) message(STATUS " EXECUTORCH_ENABLE_PROGRAM_VERIFICATION : " "${EXECUTORCH_ENABLE_PROGRAM_VERIFICATION}" ) diff --git a/tools/cmake/common/preset.cmake b/tools/cmake/common/preset.cmake index 0fde24bae6a..0affdf04bdd 100644 --- a/tools/cmake/common/preset.cmake +++ b/tools/cmake/common/preset.cmake @@ -4,6 +4,54 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# Announce the name and value of a cmake variable in the summary of the build. +function(announce_configured_options NAME) + get_property(_options GLOBAL PROPERTY _announce_configured_options) + if(NOT _options) + set_property(GLOBAL PROPERTY _announce_configured_options) + get_property(_options GLOBAL PROPERTY _announce_configured_options) + endif() + + set(option_exists FALSE) + foreach(_option IN LISTS _options) + if(_option STREQUAL "${NAME}") + set(option_exists TRUE) + break() + endif() + endforeach() + + if(NOT option_exists) + set(_options ${_options} "${NAME}") + set_property(GLOBAL PROPERTY _announce_configured_options "${_options}") + endif() +endfunction() + +# Print the configured options. +function(print_configured_options) + get_property(_options GLOBAL PROPERTY _announce_configured_options) + + set(_longest_name_length 0) + foreach(_option IN LISTS _options) + string(LENGTH "${_option}" length) + if(length GREATER _longest_name_length) + set(_longest_name_length ${length}) + endif() + endforeach() + + message(STATUS "--- Configurated Options ---\n") + foreach(_option IN LISTS _options) + string(LENGTH "${_option}" _option_length) + math(EXPR num_spaces "${_longest_name_length} - ${_option_length}") + set(padding "") + while(num_spaces GREATER 0) + set(padding "${padding} ") + math(EXPR num_spaces "${num_spaces} - 1") + endwhile() + message(STATUS "${_option}${padding} : ${${_option}}") + endforeach() + message(STATUS "---------------------------") +endfunction() + # Enforce option names to always start with EXECUTORCH. 
function(enforce_executorch_option_name NAME) if(NOT "${NAME}" MATCHES "^EXECUTORCH_") @@ -26,4 +74,6 @@ macro(define_overridable_option NAME DESCRIPTION VALUE_TYPE DEFAULT_VALUE) else() set(${NAME} ${DEFAULT_VALUE} CACHE ${VALUE_TYPE} ${DESCRIPTION}) endif() + + announce_configured_options(${NAME}) endmacro() From c35281394de8f757f20fa2fc30b25d099c7e2c35 Mon Sep 17 00:00:00 2001 From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com> Date: Thu, 8 May 2025 19:54:44 +0200 Subject: [PATCH 010/178] Arm backend: Remove redundant validation check for op_where (#10773) Signed-off-by: Sebastian Larsson --- backends/arm/operators/op_where.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/backends/arm/operators/op_where.py b/backends/arm/operators/op_where.py index d34f4134def..67392fefcd8 100644 --- a/backends/arm/operators/op_where.py +++ b/backends/arm/operators/op_where.py @@ -69,8 +69,6 @@ def define_node( ) -> None: import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore - validate_num_inputs(self.target, inputs, 3) - bi_supported_dtypes = [ ts.DType.INT8, ts.DType.INT16, @@ -99,8 +97,6 @@ def define_node( ) -> None: import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore - validate_num_inputs(self.target, inputs, 3) - mi_supported_dtypes = [ ts.DType.FP16, ts.DType.FP32, @@ -163,8 +159,6 @@ def define_node( ) -> None: import serializer.tosa_serializer as ts - validate_num_inputs(self.target, inputs, 3) - bi_supported_dtypes = [ ts.DType.INT8, ts.DType.INT16, @@ -193,8 +187,6 @@ def define_node( ) -> None: import serializer.tosa_serializer as ts - validate_num_inputs(self.target, inputs, 3) - mi_supported_dtypes = [ ts.DType.FP16, ts.DType.FP32, From d24eda4aa0bb3704aa0f86a708c1a0fc8166a240 Mon Sep 17 00:00:00 2001 From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com> Date: Thu, 8 May 2025 19:57:14 +0200 Subject: [PATCH 011/178] Arm backend: Replace asserts with exceptions in permutation code (#10774) Refactor assertion statements to raise ValueErrors for better error handling in permutation matrix and vector transformations. Ensure that conditions are checked and appropriate exceptions are raised to enhance code robustness and readability. Signed-off-by: Sebastian Larsson --- backends/arm/operators/op_permute.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index b78ee94b774..2b345cb5118 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -46,24 +46,26 @@ def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]: (1,0,2) """ N = len(permutation_matrix) - assert N == len( - permutation_matrix[0] - ), f"A permutation matrix must be square, got shape {permutation_matrix.shape}" + if N != len(permutation_matrix[0]): + raise ValueError( + f"A permutation matrix must be square, got shape {permutation_matrix.shape}" + ) p = [0] * N for row_index, row in enumerate(permutation_matrix): saw_one = False for col_index, value in enumerate(row): if value == 1: - assert ( - not saw_one - ), f"A permutation matrix can only have one 1 per row, got row {row}." + if saw_one: + raise ValueError( + f"A permutation matrix can only have one 1 per row, got {row=}" + ) p[row_index] = col_index saw_one = True - else: - assert ( - value == 0 - ), f"A permutation matrix only contains 1's and 0's, got value {value}." 
+ elif value != 0: + raise ValueError( + f"A permutation matrix only contains 1's and 0's, got {value=}" + ) return p From a37b369858ed89d12a593e318a5c9849f2c9613b Mon Sep 17 00:00:00 2001 From: trivedivivek <5340687+trivedivivek@users.noreply.github.com> Date: Thu, 8 May 2025 13:35:50 -0500 Subject: [PATCH 012/178] Minor vector sizing change. (#10753) Summary: Minor change to reserve size for VkWriteDescriptorSet and VkDescriptorSetLayoutBinding vectors. Differential Revision: D74335276 --- backends/vulkan/runtime/vk_api/Descriptor.cpp | 5 ++--- backends/vulkan/runtime/vk_api/Shader.cpp | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index b42ade6ea02..938666802ef 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -154,6 +154,7 @@ DescriptorSet& DescriptorSet::bind( VkDescriptorSet DescriptorSet::get_bind_handle() const { std::vector write_descriptor_sets; + write_descriptor_sets.reserve(bindings_.size()); for (const ResourceBinding& binding : bindings_) { VkWriteDescriptorSet write{ @@ -185,9 +186,7 @@ VkDescriptorSet DescriptorSet::get_bind_handle() const { 0u, nullptr); - VkDescriptorSet ret = handle_; - - return ret; + return handle_; } void DescriptorSet::add_binding(const ResourceBinding& binding) { diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp index e560f37868e..458b1f83956 100644 --- a/backends/vulkan/runtime/vk_api/Shader.cpp +++ b/backends/vulkan/runtime/vk_api/Shader.cpp @@ -59,10 +59,11 @@ ShaderLayout::ShaderLayout( const ShaderLayout::Signature& signature) : device_(device), handle_{VK_NULL_HANDLE} { std::vector bindings; + bindings.reserve(signature.size()); uint32_t binding_num = 0u; for (const VkDescriptorType type : signature) { - bindings.push_back({ + bindings.emplace_back(VkDescriptorSetLayoutBinding{ binding_num++, // binding type, // descriptorType 1u, // descriptorCount From 380c4f1778a70e0239067a926269dfa23473ae25 Mon Sep 17 00:00:00 2001 From: jathu Date: Thu, 8 May 2025 13:32:49 -0700 Subject: [PATCH 013/178] Allow options to be set by presets (#10767) ### Summary In this diff we create a helper that will allow presets to set options. Again this is mostly a helper to check if the option has been defined already, then no-oping. To test it, I also create the first preset `macos-arm64`. I will test it in upcoming diffs. 
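
For context, the interaction between a preset and the helper looks like this. `EXECUTORCH_BUILD_COREML` is the real option used by the new preset; the surrounding walkthrough is only a sketch of the evaluation order, assuming the preset file is included before the option is declared:

```cmake
# tools/cmake/preset/macos-arm64.cmake -- the preset pins a value first:
set_overridable_option(EXECUTORCH_BUILD_COREML ON)

# tools/cmake/preset/default.cmake later declares the option. Because the
# variable was already defined by the preset, the declared default (OFF)
# does not apply and the cached value stays ON:
define_overridable_option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" BOOL OFF)
```

An explicit `-DEXECUTORCH_BUILD_COREML=OFF` on the command line still wins over the preset, since `set_overridable_option` no-ops whenever the variable is already defined.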
### Test plan pytest for now, manual test in future diffs cc @larryliu0820 --- CMakeLists.txt | 2 - tools/cmake/Utils.cmake | 4 -- tools/cmake/common/preset.cmake | 14 ++++++ tools/cmake/common/preset_test.py | 67 ++++++++++++++++++++++++++++ tools/cmake/preset/default.cmake | 1 + tools/cmake/preset/macos-arm64.cmake | 7 +++ 6 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 tools/cmake/preset/macos-arm64.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d3f8e5f907..03e36186c94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,8 +178,6 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF ) -option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF) - option(EXECUTORCH_BUILD_KERNELS_CUSTOM "Build the custom kernels" OFF) option(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT "Build the custom ops lib for AOT" diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake index 3155c3fc16e..edbd682c7e3 100644 --- a/tools/cmake/Utils.cmake +++ b/tools/cmake/Utils.cmake @@ -45,10 +45,6 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_CADENCE : " "${EXECUTORCH_BUILD_CADENCE}" ) - message( - STATUS - " EXECUTORCH_BUILD_COREML : ${EXECUTORCH_BUILD_COREML}" - ) message( STATUS " EXECUTORCH_BUILD_CPUINFO : ${EXECUTORCH_BUILD_CPUINFO}" diff --git a/tools/cmake/common/preset.cmake b/tools/cmake/common/preset.cmake index 0affdf04bdd..8f886abab36 100644 --- a/tools/cmake/common/preset.cmake +++ b/tools/cmake/common/preset.cmake @@ -26,6 +26,7 @@ function(announce_configured_options NAME) endif() endfunction() + # Print the configured options. function(print_configured_options) get_property(_options GLOBAL PROPERTY _announce_configured_options) @@ -52,6 +53,7 @@ function(print_configured_options) message(STATUS "---------------------------") endfunction() + # Enforce option names to always start with EXECUTORCH. function(enforce_executorch_option_name NAME) if(NOT "${NAME}" MATCHES "^EXECUTORCH_") @@ -59,6 +61,7 @@ function(enforce_executorch_option_name NAME) endif() endfunction() + # Define an overridable option. # 1) If the option is already defined in the process, then store that in cache # 2) If the option is NOT set, then store the default value in cache @@ -77,3 +80,14 @@ macro(define_overridable_option NAME DESCRIPTION VALUE_TYPE DEFAULT_VALUE) announce_configured_options(${NAME}) endmacro() + + +# Set an overridable option. +macro(set_overridable_option NAME VALUE) + # If the user has explitily set the option, do not override it. + if(DEFINED ${NAME}) + return() + endif() + + set(${NAME} ${VALUE} CACHE STRING "") +endmacro() diff --git a/tools/cmake/common/preset_test.py b/tools/cmake/common/preset_test.py index eb564eadace..1748062f166 100644 --- a/tools/cmake/common/preset_test.py +++ b/tools/cmake/common/preset_test.py @@ -223,3 +223,70 @@ def test_define_overridable_option_cli_override_with_set_override(self): self.run_cmake(cmake_args=["-DEXECUTORCH_TEST_MESSAGE='cli value'"]) # If an option is set through cmake, it should NOT be overridable from the CLI. 
self.assert_cmake_cache("EXECUTORCH_TEST_MESSAGE", "set value", "STRING") + + def test_set_overridable_option_before(self): + _cmake_lists_txt = """ + cmake_minimum_required(VERSION 3.24) + project(test_preset) + include(${PROJECT_SOURCE_DIR}/preset.cmake) + set_overridable_option(EXECUTORCH_TEST_MESSAGE "from set_overridable_option") + add_subdirectory(build) + """ + _build_cmake_lists_txt = """ + define_overridable_option(EXECUTORCH_TEST_MESSAGE "test message" STRING "move fast") + """ + self.create_workspace( + { + "CMakeLists.txt": _cmake_lists_txt, + "build": { + "CMakeLists.txt": _build_cmake_lists_txt, + }, + } + ) + self.run_cmake() + self.assert_cmake_cache( + "EXECUTORCH_TEST_MESSAGE", "from set_overridable_option", "STRING" + ) + + def test_set_overridable_option_after(self): + _cmake_lists_txt = """ + cmake_minimum_required(VERSION 3.24) + project(test_preset) + include(${PROJECT_SOURCE_DIR}/preset.cmake) + add_subdirectory(build) + set_overridable_option(EXECUTORCH_TEST_MESSAGE "from set_overridable_option") + """ + _build_cmake_lists_txt = """ + define_overridable_option(EXECUTORCH_TEST_MESSAGE "test message" STRING "move fast") + """ + self.create_workspace( + { + "CMakeLists.txt": _cmake_lists_txt, + "build": { + "CMakeLists.txt": _build_cmake_lists_txt, + }, + } + ) + self.run_cmake() + self.assert_cmake_cache("EXECUTORCH_TEST_MESSAGE", "move fast", "STRING") + + def test_set_overridable_option_with_cli_override(self): + _cmake_lists_txt = """ + cmake_minimum_required(VERSION 3.24) + project(test_preset) + include(${PROJECT_SOURCE_DIR}/preset.cmake) + add_subdirectory(build) + """ + _build_cmake_lists_txt = """ + define_overridable_option(EXECUTORCH_TEST_MESSAGE "test message" STRING "move fast") + """ + self.create_workspace( + { + "CMakeLists.txt": _cmake_lists_txt, + "build": { + "CMakeLists.txt": _build_cmake_lists_txt, + }, + } + ) + self.run_cmake(cmake_args=["-DEXECUTORCH_TEST_MESSAGE='from the cli'"]) + self.assert_cmake_cache("EXECUTORCH_TEST_MESSAGE", "from the cli", "STRING") diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index eafa8a7a937..5fbb47b1396 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -15,3 +15,4 @@ endif() # MARK: - Definitions define_overridable_option(EXECUTORCH_ENABLE_LOGGING "Build with ET_LOG_ENABLED" BOOL ${_is_build_type_debug}) +define_overridable_option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" BOOL OFF) diff --git a/tools/cmake/preset/macos-arm64.cmake b/tools/cmake/preset/macos-arm64.cmake new file mode 100644 index 00000000000..84e60c50b92 --- /dev/null +++ b/tools/cmake/preset/macos-arm64.cmake @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set_overridable_option(EXECUTORCH_BUILD_COREML ON) From d25ce549e5eb5ce606b995d9d839fabfb0e1e990 Mon Sep 17 00:00:00 2001 From: tmsl Date: Thu, 8 May 2025 13:42:35 -0700 Subject: [PATCH 014/178] Convert the unit test from java to kotlin (#10702) ### Summary This change converts the unit test from java to kotlin. 
### Test plan ./gradlew :executorch_android:testDebugUnitTest --------- Co-authored-by: Haiting Pu --- .../android/executorch_android/build.gradle | 5 + .../org/pytorch/executorch/EValueTest.java | 230 ------------- .../java/org/pytorch/executorch/EValueTest.kt | 224 +++++++++++++ .../org/pytorch/executorch/TensorTest.java | 305 ------------------ .../java/org/pytorch/executorch/TensorTest.kt | 296 +++++++++++++++++ extension/android/gradle/libs.versions.toml | 5 + 6 files changed, 530 insertions(+), 535 deletions(-) delete mode 100644 extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java create mode 100644 extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.kt delete mode 100644 extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java create mode 100644 extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.kt diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle index 15088f4097f..fac08588740 100644 --- a/extension/android/executorch_android/build.gradle +++ b/extension/android/executorch_android/build.gradle @@ -9,6 +9,7 @@ plugins { id "com.android.library" version "8.9.0" id "com.vanniktech.maven.publish" version "0.31.0" + alias(libs.plugins.jetbrains.kotlin.android) } android { @@ -34,6 +35,9 @@ android { resources.srcDirs += [ 'src/androidTest/resources' ] } } + kotlinOptions { + jvmTarget = "1.8" + } } task copyTestRes(type: Exec) { @@ -43,6 +47,7 @@ task copyTestRes(type: Exec) { dependencies { implementation 'com.facebook.fbjni:fbjni:0.5.1' implementation 'com.facebook.soloader:nativeloader:0.10.5' + implementation libs.core.ktx testImplementation 'junit:junit:4.12' androidTestImplementation 'androidx.test.ext:junit:1.1.5' androidTestImplementation 'androidx.test:rules:1.2.0' diff --git a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java deleted file mode 100644 index cbeb3a7b634..00000000000 --- a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.util.Arrays; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Unit tests for {@link EValue}. 
*/ -@RunWith(JUnit4.class) -public class EValueTest { - @Test - public void testNone() { - EValue evalue = EValue.optionalNone(); - assertTrue(evalue.isNone()); - } - - @Test - public void testTensorValue() { - long[] data = {1, 2, 3}; - long[] shape = {1, 3}; - EValue evalue = EValue.from(Tensor.fromBlob(data, shape)); - assertTrue(evalue.isTensor()); - assertTrue(Arrays.equals(evalue.toTensor().shape, shape)); - assertTrue(Arrays.equals(evalue.toTensor().getDataAsLongArray(), data)); - } - - @Test - public void testBoolValue() { - EValue evalue = EValue.from(true); - assertTrue(evalue.isBool()); - assertTrue(evalue.toBool()); - } - - @Test - public void testIntValue() { - EValue evalue = EValue.from(1); - assertTrue(evalue.isInt()); - assertEquals(evalue.toInt(), 1); - } - - @Test - public void testDoubleValue() { - EValue evalue = EValue.from(0.1d); - assertTrue(evalue.isDouble()); - assertEquals(evalue.toDouble(), 0.1d, 0.0001d); - } - - @Test - public void testStringValue() { - EValue evalue = EValue.from("a"); - assertTrue(evalue.isString()); - assertEquals(evalue.toStr(), "a"); - } - - @Test - public void testAllIllegalCast() { - EValue evalue = EValue.optionalNone(); - assertTrue(evalue.isNone()); - - // try Tensor - assertFalse(evalue.isTensor()); - try { - evalue.toTensor(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - } - - // try bool - assertFalse(evalue.isBool()); - try { - evalue.toBool(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - } - - // try int - assertFalse(evalue.isInt()); - try { - evalue.toInt(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - } - - // try double - assertFalse(evalue.isDouble()); - try { - evalue.toDouble(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - } - - // try string - assertFalse(evalue.isString()); - try { - evalue.toStr(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - } - } - - @Test - public void testNoneSerde() { - EValue evalue = EValue.optionalNone(); - byte[] bytes = evalue.toByteArray(); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isNone(), true); - } - - @Test - public void testBoolSerde() { - EValue evalue = EValue.from(true); - byte[] bytes = evalue.toByteArray(); - assertEquals(1, bytes[1]); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isBool(), true); - assertEquals(deser.toBool(), true); - } - - @Test - public void testBoolSerde2() { - EValue evalue = EValue.from(false); - byte[] bytes = evalue.toByteArray(); - assertEquals(0, bytes[1]); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isBool(), true); - assertEquals(deser.toBool(), false); - } - - @Test - public void testIntSerde() { - EValue evalue = EValue.from(1); - byte[] bytes = evalue.toByteArray(); - assertEquals(0, bytes[1]); - assertEquals(0, bytes[2]); - assertEquals(0, bytes[3]); - assertEquals(0, bytes[4]); - assertEquals(0, bytes[5]); - assertEquals(0, bytes[6]); - assertEquals(0, bytes[7]); - assertEquals(1, bytes[8]); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isInt(), true); - assertEquals(deser.toInt(), 1); - } - - @Test - public void testLargeIntSerde() { - EValue evalue = EValue.from(256000); - byte[] bytes = evalue.toByteArray(); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isInt(), true); - assertEquals(deser.toInt(), 256000); - } - - @Test - public void 
testDoubleSerde() { - EValue evalue = EValue.from(1.345e-2d); - byte[] bytes = evalue.toByteArray(); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isDouble(), true); - assertEquals(1.345e-2d, deser.toDouble(), 1e-6); - } - - @Test - public void testLongTensorSerde() { - long data[] = {1, 2, 3, 4}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - - EValue evalue = EValue.from(tensor); - byte[] bytes = evalue.toByteArray(); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isTensor(), true); - Tensor deserTensor = deser.toTensor(); - long[] deserShape = deserTensor.shape(); - long[] deserData = deserTensor.getDataAsLongArray(); - - for (int i = 0; i < data.length; i++) { - assertEquals(data[i], deserData[i]); - } - - for (int i = 0; i < shape.length; i++) { - assertEquals(shape[i], deserShape[i]); - } - } - - @Test - public void testFloatTensorSerde() { - float data[] = {Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - - EValue evalue = EValue.from(tensor); - byte[] bytes = evalue.toByteArray(); - - EValue deser = EValue.fromByteArray(bytes); - assertEquals(deser.isTensor(), true); - Tensor deserTensor = deser.toTensor(); - long[] deserShape = deserTensor.shape(); - float[] deserData = deserTensor.getDataAsFloatArray(); - - for (int i = 0; i < data.length; i++) { - assertEquals(data[i], deserData[i], 1e-5); - } - - for (int i = 0; i < shape.length; i++) { - assertEquals(shape[i], deserShape[i]); - } - } -} diff --git a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.kt b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.kt new file mode 100644 index 00000000000..0e56480d621 --- /dev/null +++ b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.kt @@ -0,0 +1,224 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +package org.pytorch.executorch + +import org.junit.Assert +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +/** Unit tests for [EValue]. 
*/ +@RunWith(JUnit4::class) +class EValueTest { + @Test + fun testNone() { + val evalue = EValue.optionalNone() + Assert.assertTrue(evalue.isNone) + } + + @Test + fun testTensorValue() { + val data = longArrayOf(1, 2, 3) + val shape = longArrayOf(1, 3) + val evalue = EValue.from(Tensor.fromBlob(data, shape)) + Assert.assertTrue(evalue.isTensor) + Assert.assertTrue(evalue.toTensor().shape.contentEquals(shape)) + Assert.assertTrue(evalue.toTensor().dataAsLongArray.contentEquals(data)) + } + + @Test + fun testBoolValue() { + val evalue = EValue.from(true) + Assert.assertTrue(evalue.isBool) + Assert.assertTrue(evalue.toBool()) + } + + @Test + fun testIntValue() { + val evalue = EValue.from(1) + Assert.assertTrue(evalue.isInt) + Assert.assertEquals(evalue.toInt(), 1) + } + + @Test + fun testDoubleValue() { + val evalue = EValue.from(0.1) + Assert.assertTrue(evalue.isDouble) + Assert.assertEquals(evalue.toDouble(), 0.1, 0.0001) + } + + @Test + fun testStringValue() { + val evalue = EValue.from("a") + Assert.assertTrue(evalue.isString) + Assert.assertEquals(evalue.toStr(), "a") + } + + @Test + fun testAllIllegalCast() { + val evalue = EValue.optionalNone() + Assert.assertTrue(evalue.isNone) + + // try Tensor + Assert.assertFalse(evalue.isTensor) + try { + evalue.toTensor() + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + } + + // try bool + Assert.assertFalse(evalue.isBool) + try { + evalue.toBool() + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + } + + // try int + Assert.assertFalse(evalue.isInt) + try { + evalue.toInt() + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + } + + // try double + Assert.assertFalse(evalue.isDouble) + try { + evalue.toDouble() + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + } + + // try string + Assert.assertFalse(evalue.isString) + try { + evalue.toStr() + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + } + } + + @Test + fun testNoneSerde() { + val evalue = EValue.optionalNone() + val bytes = evalue.toByteArray() + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isNone, true) + } + + @Test + fun testBoolSerde() { + val evalue = EValue.from(true) + val bytes = evalue.toByteArray() + Assert.assertEquals(1, bytes[1].toLong()) + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isBool, true) + Assert.assertEquals(deser.toBool(), true) + } + + @Test + fun testBoolSerde2() { + val evalue = EValue.from(false) + val bytes = evalue.toByteArray() + Assert.assertEquals(0, bytes[1].toLong()) + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isBool, true) + Assert.assertEquals(deser.toBool(), false) + } + + @Test + fun testIntSerde() { + val evalue = EValue.from(1) + val bytes = evalue.toByteArray() + Assert.assertEquals(0, bytes[1].toLong()) + Assert.assertEquals(0, bytes[2].toLong()) + Assert.assertEquals(0, bytes[3].toLong()) + Assert.assertEquals(0, bytes[4].toLong()) + Assert.assertEquals(0, bytes[5].toLong()) + Assert.assertEquals(0, bytes[6].toLong()) + Assert.assertEquals(0, bytes[7].toLong()) + Assert.assertEquals(1, bytes[8].toLong()) + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isInt, true) + Assert.assertEquals(deser.toInt(), 1) + } + + @Test + fun testLargeIntSerde() { + val evalue = EValue.from(256000) + val bytes = evalue.toByteArray() + + val deser = 
EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isInt, true) + Assert.assertEquals(deser.toInt(), 256000) + } + + @Test + fun testDoubleSerde() { + val evalue = EValue.from(1.345e-2) + val bytes = evalue.toByteArray() + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isDouble, true) + Assert.assertEquals(1.345e-2, deser.toDouble(), 1e-6) + } + + @Test + fun testLongTensorSerde() { + val data = longArrayOf(1, 2, 3, 4) + val shape = longArrayOf(2, 2) + val tensor = Tensor.fromBlob(data, shape) + + val evalue = EValue.from(tensor) + val bytes = evalue.toByteArray() + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isTensor, true) + val deserTensor = deser.toTensor() + val deserShape = deserTensor.shape() + val deserData = deserTensor.dataAsLongArray + + for (i in data.indices) { + Assert.assertEquals(data[i], deserData[i]) + } + + for (i in shape.indices) { + Assert.assertEquals(shape[i], deserShape[i]) + } + } + + @Test + fun testFloatTensorSerde() { + val data = floatArrayOf(Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE) + val shape = longArrayOf(2, 2) + val tensor = Tensor.fromBlob(data, shape) + + val evalue = EValue.from(tensor) + val bytes = evalue.toByteArray() + + val deser = EValue.fromByteArray(bytes) + Assert.assertEquals(deser.isTensor, true) + val deserTensor = deser.toTensor() + val deserShape = deserTensor.shape() + val deserData = deserTensor.dataAsFloatArray + + for (i in data.indices) { + Assert.assertEquals(data[i].toDouble(), deserData[i].toDouble(), 1e-5) + } + + for (i in shape.indices) { + Assert.assertEquals(shape[i], deserShape[i]) + } + } +} diff --git a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java deleted file mode 100644 index 9811a1d0ff6..00000000000 --- a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -import java.nio.ByteBuffer; -import java.nio.DoubleBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; -import java.nio.LongBuffer; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Unit tests for {@link Tensor}. 
*/ -@RunWith(JUnit4.class) -public class TensorTest { - - @Test - public void testFloatTensor() { - float data[] = {Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.FLOAT); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsFloatArray()[0], 1e-5); - assertEquals(data[1], tensor.getDataAsFloatArray()[1], 1e-5); - assertEquals(data[2], tensor.getDataAsFloatArray()[2], 1e-5); - assertEquals(data[3], tensor.getDataAsFloatArray()[3], 1e-5); - - FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(4); - floatBuffer.put(data); - tensor = Tensor.fromBlob(floatBuffer, shape); - assertEquals(tensor.dtype(), DType.FLOAT); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsFloatArray()[0], 1e-5); - assertEquals(data[1], tensor.getDataAsFloatArray()[1], 1e-5); - assertEquals(data[2], tensor.getDataAsFloatArray()[2], 1e-5); - assertEquals(data[3], tensor.getDataAsFloatArray()[3], 1e-5); - } - - @Test - public void testIntTensor() { - int data[] = {Integer.MIN_VALUE, 0, 1, Integer.MAX_VALUE}; - long shape[] = {1, 4, 1}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.INT32); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsIntArray()[0]); - assertEquals(data[1], tensor.getDataAsIntArray()[1]); - assertEquals(data[2], tensor.getDataAsIntArray()[2]); - assertEquals(data[3], tensor.getDataAsIntArray()[3]); - - IntBuffer intBuffer = Tensor.allocateIntBuffer(4); - intBuffer.put(data); - tensor = Tensor.fromBlob(intBuffer, shape); - assertEquals(tensor.dtype(), DType.INT32); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsIntArray()[0]); - assertEquals(data[1], tensor.getDataAsIntArray()[1]); - assertEquals(data[2], tensor.getDataAsIntArray()[2]); - assertEquals(data[3], tensor.getDataAsIntArray()[3]); - } - - @Test - public void testDoubleTensor() { - double data[] = {Double.MIN_VALUE, 0.0d, 0.1d, Double.MAX_VALUE}; - long shape[] = {1, 4}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.DOUBLE); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsDoubleArray()[0], 1e-5); - assertEquals(data[1], tensor.getDataAsDoubleArray()[1], 1e-5); - assertEquals(data[2], tensor.getDataAsDoubleArray()[2], 1e-5); - assertEquals(data[3], tensor.getDataAsDoubleArray()[3], 1e-5); - - DoubleBuffer doubleBuffer = Tensor.allocateDoubleBuffer(4); - doubleBuffer.put(data); - tensor = Tensor.fromBlob(doubleBuffer, shape); - assertEquals(tensor.dtype(), DType.DOUBLE); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsDoubleArray()[0], 1e-5); - assertEquals(data[1], tensor.getDataAsDoubleArray()[1], 1e-5); - assertEquals(data[2], tensor.getDataAsDoubleArray()[2], 1e-5); - assertEquals(data[3], 
tensor.getDataAsDoubleArray()[3], 1e-5); - } - - @Test - public void testLongTensor() { - long data[] = {Long.MIN_VALUE, 0L, 1L, Long.MAX_VALUE}; - long shape[] = {4, 1}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.INT64); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsLongArray()[0]); - assertEquals(data[1], tensor.getDataAsLongArray()[1]); - assertEquals(data[2], tensor.getDataAsLongArray()[2]); - assertEquals(data[3], tensor.getDataAsLongArray()[3]); - - LongBuffer longBuffer = Tensor.allocateLongBuffer(4); - longBuffer.put(data); - tensor = Tensor.fromBlob(longBuffer, shape); - assertEquals(tensor.dtype(), DType.INT64); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsLongArray()[0]); - assertEquals(data[1], tensor.getDataAsLongArray()[1]); - assertEquals(data[2], tensor.getDataAsLongArray()[2]); - assertEquals(data[3], tensor.getDataAsLongArray()[3]); - } - - @Test - public void testSignedByteTensor() { - byte data[] = {Byte.MIN_VALUE, (byte) 0, (byte) 1, Byte.MAX_VALUE}; - long shape[] = {1, 1, 4}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.INT8); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsByteArray()[0]); - assertEquals(data[1], tensor.getDataAsByteArray()[1]); - assertEquals(data[2], tensor.getDataAsByteArray()[2]); - assertEquals(data[3], tensor.getDataAsByteArray()[3]); - - ByteBuffer byteBuffer = Tensor.allocateByteBuffer(4); - byteBuffer.put(data); - tensor = Tensor.fromBlob(byteBuffer, shape); - assertEquals(tensor.dtype(), DType.INT8); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsByteArray()[0]); - assertEquals(data[1], tensor.getDataAsByteArray()[1]); - assertEquals(data[2], tensor.getDataAsByteArray()[2]); - assertEquals(data[3], tensor.getDataAsByteArray()[3]); - } - - @Test - public void testUnsignedByteTensor() { - byte data[] = {(byte) 0, (byte) 1, (byte) 2, (byte) 255}; - long shape[] = {4, 1, 1}; - Tensor tensor = Tensor.fromBlobUnsigned(data, shape); - assertEquals(tensor.dtype(), DType.UINT8); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsUnsignedByteArray()[0]); - assertEquals(data[1], tensor.getDataAsUnsignedByteArray()[1]); - assertEquals(data[2], tensor.getDataAsUnsignedByteArray()[2]); - assertEquals(data[3], tensor.getDataAsUnsignedByteArray()[3]); - - ByteBuffer byteBuffer = Tensor.allocateByteBuffer(4); - byteBuffer.put(data); - tensor = Tensor.fromBlobUnsigned(byteBuffer, shape); - assertEquals(tensor.dtype(), DType.UINT8); - assertEquals(shape[0], tensor.shape()[0]); - assertEquals(shape[1], tensor.shape()[1]); - assertEquals(shape[2], tensor.shape()[2]); - assertEquals(4, tensor.numel()); - assertEquals(data[0], tensor.getDataAsUnsignedByteArray()[0]); - assertEquals(data[1], tensor.getDataAsUnsignedByteArray()[1]); - assertEquals(data[2], 
tensor.getDataAsUnsignedByteArray()[2]); - assertEquals(data[3], tensor.getDataAsUnsignedByteArray()[3]); - } - - @Test - public void testIllegalDataTypeException() { - float data[] = {Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - assertEquals(tensor.dtype(), DType.FLOAT); - - try { - tensor.getDataAsByteArray(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - // expected - } - try { - tensor.getDataAsUnsignedByteArray(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - // expected - } - try { - tensor.getDataAsIntArray(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - // expected - } - try { - tensor.getDataAsDoubleArray(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - // expected - } - try { - tensor.getDataAsLongArray(); - fail("Should have thrown an exception"); - } catch (IllegalStateException e) { - // expected - } - } - - @Test - public void testIllegalArguments() { - float data[] = {Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE}; - long shapeWithNegativeValues[] = {-1, 2}; - long mismatchShape[] = {1, 2}; - - try { - Tensor tensor = Tensor.fromBlob((float[]) null, mismatchShape); - fail("Should have thrown an exception"); - } catch (IllegalArgumentException e) { - // expected - } - try { - Tensor tensor = Tensor.fromBlob(data, null); - fail("Should have thrown an exception"); - } catch (IllegalArgumentException e) { - // expected - } - try { - Tensor tensor = Tensor.fromBlob(data, shapeWithNegativeValues); - fail("Should have thrown an exception"); - } catch (IllegalArgumentException e) { - // expected - } - try { - Tensor tensor = Tensor.fromBlob(data, mismatchShape); - fail("Should have thrown an exception"); - } catch (IllegalArgumentException e) { - // expected - } - } - - @Test - public void testLongTensorSerde() { - long data[] = {1, 2, 3, 4}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - byte[] bytes = tensor.toByteArray(); - - Tensor deser = Tensor.fromByteArray(bytes); - long[] deserShape = deser.shape(); - long[] deserData = deser.getDataAsLongArray(); - - for (int i = 0; i < data.length; i++) { - assertEquals(data[i], deserData[i]); - } - - for (int i = 0; i < shape.length; i++) { - assertEquals(shape[i], deserShape[i]); - } - } - - @Test - public void testFloatTensorSerde() { - float data[] = {Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE}; - long shape[] = {2, 2}; - Tensor tensor = Tensor.fromBlob(data, shape); - byte[] bytes = tensor.toByteArray(); - - Tensor deser = Tensor.fromByteArray(bytes); - long[] deserShape = deser.shape(); - float[] deserData = deser.getDataAsFloatArray(); - - for (int i = 0; i < data.length; i++) { - assertEquals(data[i], deserData[i], 1e-5); - } - - for (int i = 0; i < shape.length; i++) { - assertEquals(shape[i], deserShape[i]); - } - } -} diff --git a/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.kt b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.kt new file mode 100644 index 00000000000..4b206c8efbd --- /dev/null +++ b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.kt @@ -0,0 +1,296 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +package org.pytorch.executorch + +import org.junit.Assert +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +/** Unit tests for [Tensor]. */ +@RunWith(JUnit4::class) +class TensorTest { + @Test + fun testFloatTensor() { + val data = floatArrayOf(Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE) + val shape = longArrayOf(2, 2) + var tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.FLOAT) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toDouble(), tensor.dataAsFloatArray[0].toDouble(), 1e-5) + Assert.assertEquals(data[1].toDouble(), tensor.dataAsFloatArray[1].toDouble(), 1e-5) + Assert.assertEquals(data[2].toDouble(), tensor.dataAsFloatArray[2].toDouble(), 1e-5) + Assert.assertEquals(data[3].toDouble(), tensor.dataAsFloatArray[3].toDouble(), 1e-5) + + val floatBuffer = Tensor.allocateFloatBuffer(4) + floatBuffer.put(data) + tensor = Tensor.fromBlob(floatBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.FLOAT) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toDouble(), tensor.dataAsFloatArray[0].toDouble(), 1e-5) + Assert.assertEquals(data[1].toDouble(), tensor.dataAsFloatArray[1].toDouble(), 1e-5) + Assert.assertEquals(data[2].toDouble(), tensor.dataAsFloatArray[2].toDouble(), 1e-5) + Assert.assertEquals(data[3].toDouble(), tensor.dataAsFloatArray[3].toDouble(), 1e-5) + } + + @Test + fun testIntTensor() { + val data = intArrayOf(Int.MIN_VALUE, 0, 1, Int.MAX_VALUE) + val shape = longArrayOf(1, 4, 1) + var tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.INT32) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsIntArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsIntArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsIntArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsIntArray[3].toLong()) + + val intBuffer = Tensor.allocateIntBuffer(4) + intBuffer.put(data) + tensor = Tensor.fromBlob(intBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.INT32) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsIntArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsIntArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsIntArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsIntArray[3].toLong()) + } + + @Test + fun testDoubleTensor() { + val data = doubleArrayOf(Double.MIN_VALUE, 0.0, 0.1, Double.MAX_VALUE) + val shape = longArrayOf(1, 4) + var tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.DOUBLE) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + 
Assert.assertEquals(data[0], tensor.dataAsDoubleArray[0], 1e-5) + Assert.assertEquals(data[1], tensor.dataAsDoubleArray[1], 1e-5) + Assert.assertEquals(data[2], tensor.dataAsDoubleArray[2], 1e-5) + Assert.assertEquals(data[3], tensor.dataAsDoubleArray[3], 1e-5) + + val doubleBuffer = Tensor.allocateDoubleBuffer(4) + doubleBuffer.put(data) + tensor = Tensor.fromBlob(doubleBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.DOUBLE) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0], tensor.dataAsDoubleArray[0], 1e-5) + Assert.assertEquals(data[1], tensor.dataAsDoubleArray[1], 1e-5) + Assert.assertEquals(data[2], tensor.dataAsDoubleArray[2], 1e-5) + Assert.assertEquals(data[3], tensor.dataAsDoubleArray[3], 1e-5) + } + + @Test + fun testLongTensor() { + val data = longArrayOf(Long.MIN_VALUE, 0L, 1L, Long.MAX_VALUE) + val shape = longArrayOf(4, 1) + var tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.INT64) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0], tensor.dataAsLongArray[0]) + Assert.assertEquals(data[1], tensor.dataAsLongArray[1]) + Assert.assertEquals(data[2], tensor.dataAsLongArray[2]) + Assert.assertEquals(data[3], tensor.dataAsLongArray[3]) + + val longBuffer = Tensor.allocateLongBuffer(4) + longBuffer.put(data) + tensor = Tensor.fromBlob(longBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.INT64) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0], tensor.dataAsLongArray[0]) + Assert.assertEquals(data[1], tensor.dataAsLongArray[1]) + Assert.assertEquals(data[2], tensor.dataAsLongArray[2]) + Assert.assertEquals(data[3], tensor.dataAsLongArray[3]) + } + + @Test + fun testSignedByteTensor() { + val data = byteArrayOf(Byte.MIN_VALUE, 0.toByte(), 1.toByte(), Byte.MAX_VALUE) + val shape = longArrayOf(1, 1, 4) + var tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.INT8) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsByteArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsByteArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsByteArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsByteArray[3].toLong()) + + val byteBuffer = Tensor.allocateByteBuffer(4) + byteBuffer.put(data) + tensor = Tensor.fromBlob(byteBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.INT8) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsByteArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsByteArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsByteArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsByteArray[3].toLong()) + } + + @Test + fun testUnsignedByteTensor() { + val data = byteArrayOf(0.toByte(), 1.toByte(), 2.toByte(), 255.toByte()) + val shape = 
longArrayOf(4, 1, 1) + var tensor = Tensor.fromBlobUnsigned(data, shape) + Assert.assertEquals(tensor.dtype(), DType.UINT8) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsUnsignedByteArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsUnsignedByteArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsUnsignedByteArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsUnsignedByteArray[3].toLong()) + + val byteBuffer = Tensor.allocateByteBuffer(4) + byteBuffer.put(data) + tensor = Tensor.fromBlobUnsigned(byteBuffer, shape) + Assert.assertEquals(tensor.dtype(), DType.UINT8) + Assert.assertEquals(shape[0], tensor.shape()[0]) + Assert.assertEquals(shape[1], tensor.shape()[1]) + Assert.assertEquals(shape[2], tensor.shape()[2]) + Assert.assertEquals(4, tensor.numel()) + Assert.assertEquals(data[0].toLong(), tensor.dataAsUnsignedByteArray[0].toLong()) + Assert.assertEquals(data[1].toLong(), tensor.dataAsUnsignedByteArray[1].toLong()) + Assert.assertEquals(data[2].toLong(), tensor.dataAsUnsignedByteArray[2].toLong()) + Assert.assertEquals(data[3].toLong(), tensor.dataAsUnsignedByteArray[3].toLong()) + } + + @Test + fun testIllegalDataTypeException() { + val data = floatArrayOf(Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE) + val shape = longArrayOf(2, 2) + val tensor = Tensor.fromBlob(data, shape) + Assert.assertEquals(tensor.dtype(), DType.FLOAT) + + try { + tensor.dataAsByteArray + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + // expected + } + try { + tensor.dataAsUnsignedByteArray + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + // expected + } + try { + tensor.dataAsIntArray + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + // expected + } + try { + tensor.dataAsDoubleArray + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + // expected + } + try { + tensor.dataAsLongArray + Assert.fail("Should have thrown an exception") + } catch (e: IllegalStateException) { + // expected + } + } + + @Test + fun testIllegalArguments() { + val data = floatArrayOf(Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE) + val shapeWithNegativeValues = longArrayOf(-1, 2) + val mismatchShape = longArrayOf(1, 2) + + try { + val tensor = Tensor.fromBlob(null as FloatArray?, mismatchShape) + Assert.fail("Should have thrown an exception") + } catch (e: IllegalArgumentException) { + // expected + } + try { + val tensor = Tensor.fromBlob(data, null) + Assert.fail("Should have thrown an exception") + } catch (e: IllegalArgumentException) { + // expected + } + try { + val tensor = Tensor.fromBlob(data, shapeWithNegativeValues) + Assert.fail("Should have thrown an exception") + } catch (e: IllegalArgumentException) { + // expected + } + try { + val tensor = Tensor.fromBlob(data, mismatchShape) + Assert.fail("Should have thrown an exception") + } catch (e: IllegalArgumentException) { + // expected + } + } + + @Test + fun testLongTensorSerde() { + val data = longArrayOf(1, 2, 3, 4) + val shape = longArrayOf(2, 2) + val tensor = Tensor.fromBlob(data, shape) + val bytes = tensor.toByteArray() + + val deser = Tensor.fromByteArray(bytes) + val deserShape = deser.shape() + val deserData = deser.dataAsLongArray + + for (i in data.indices) 
{ + Assert.assertEquals(data[i], deserData[i]) + } + + for (i in shape.indices) { + Assert.assertEquals(shape[i], deserShape[i]) + } + } + + @Test + fun testFloatTensorSerde() { + val data = floatArrayOf(Float.MIN_VALUE, 0f, 0.1f, Float.MAX_VALUE) + val shape = longArrayOf(2, 2) + val tensor = Tensor.fromBlob(data, shape) + val bytes = tensor.toByteArray() + + val deser = Tensor.fromByteArray(bytes) + val deserShape = deser.shape() + val deserData = deser.dataAsFloatArray + + for (i in data.indices) { + Assert.assertEquals(data[i].toDouble(), deserData[i].toDouble(), 1e-5) + } + + for (i in shape.indices) { + Assert.assertEquals(shape[i], deserShape[i]) + } + } +} diff --git a/extension/android/gradle/libs.versions.toml b/extension/android/gradle/libs.versions.toml index 561988cb1f6..fcd6a356536 100644 --- a/extension/android/gradle/libs.versions.toml +++ b/extension/android/gradle/libs.versions.toml @@ -5,8 +5,13 @@ commons-math3 = "3.6.1" guava = "32.1.3-jre" junit = "4.13.2" +core-ktx = "1.13.1" +kotlin = "1.9.23" [libraries] commons-math3 = { module = "org.apache.commons:commons-math3", version.ref = "commons-math3" } guava = { module = "com.google.guava:guava", version.ref = "guava" } junit = { module = "junit:junit", version.ref = "junit" } +core-ktx = { group = "androidx.core", name = "core-ktx", version.ref = "core-ktx" } +[plugins] +jetbrains-kotlin-android = { id = "org.jetbrains.kotlin.android", version.ref = "kotlin" } From ac26555aa86cceaddfbf16a17fb4548c5dbc3b40 Mon Sep 17 00:00:00 2001 From: jathu Date: Thu, 8 May 2025 14:41:50 -0700 Subject: [PATCH 015/178] Create a macos-arm64 preset (#10768) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary * Create the base for a macos-arm64 preset — bigger migration in future diffs * Create an Apple CI job to test builds ### Test plan CI + ``` $ cmake --preset macos-arm64 -- Loading build preset: /Users/jathu/executorch/tools/cmake/preset/macos-arm64.cmake -- --- Configurated Options --- -- EXECUTORCH_BUILD_PRESET_FILE : /Users/jathu/executorch/tools/cmake/preset/macos-arm64.cmake -- EXECUTORCH_ENABLE_LOGGING : ON -- EXECUTORCH_BUILD_COREML : ON -- --------------------------- $ cmake --build cmake-out --parallel ``` cc @larryliu0820 --- .github/workflows/build-presets.yml | 17 +++++++++++++++ CMakeLists.txt | 2 ++ CMakePresets.json | 33 +++++++++++++++++++++++++++++ tools/cmake/Utils.cmake | 5 +++++ tools/cmake/common/preset.cmake | 11 ++++++++++ 5 files changed, 68 insertions(+) create mode 100644 CMakePresets.json diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml index 39bc9dc6480..7f3c958ae55 100644 --- a/.github/workflows/build-presets.yml +++ b/.github/workflows/build-presets.yml @@ -11,3 +11,20 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true + +jobs: + apple: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + matrix: + preset: [macos-arm64] + with: + job-name: build + runner: macos-latest-xlarge + python-version: 3.12 + submodules: recursive + script: | + set -eux + ${CONDA_RUN} ./install_requirements.sh > /dev/null + ${CONDA_RUN} cmake --preset ${{ matrix.preset }} + ${CONDA_RUN} cmake --build cmake-out --parallel diff --git a/CMakeLists.txt b/CMakeLists.txt index 03e36186c94..76c75270d5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,8 
@@ project(executorch)
 # MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
 
 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+
+load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
 
 # Print all the configs that were called with announce_configured_options.
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 00000000000..5006ba9ec05
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,33 @@
+{
+  "version": 10,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 31,
+    "patch": 0
+  },
+  "$comment": "On-device AI across mobile, embedded and edge for PyTorch.",
+  "configurePresets": [
+    {
+      "name": "common",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/cmake-out",
+      "generator": "Unix Makefiles"
+    },
+    {
+      "name": "macos-arm64",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos-arm64.cmake",
+        "PLATFORM": "MAC_ARM64",
+        "DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    }
+  ]
+}
diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake
index edbd682c7e3..dda83f1794e 100644
--- a/tools/cmake/Utils.cmake
+++ b/tools/cmake/Utils.cmake
@@ -313,6 +313,11 @@ function(resolve_python_executable)
         python
         PARENT_SCOPE
     )
+  elseif(DEFINED ENV{VIRTUAL_ENV})
+    set(PYTHON_EXECUTABLE
+        $ENV{VIRTUAL_ENV}/bin/python3
+        PARENT_SCOPE
+    )
   else()
     set(PYTHON_EXECUTABLE
         python3
diff --git a/tools/cmake/common/preset.cmake b/tools/cmake/common/preset.cmake
index 8f886abab36..e9933c8f05e 100644
--- a/tools/cmake/common/preset.cmake
+++ b/tools/cmake/common/preset.cmake
@@ -91,3 +91,14 @@ macro(set_overridable_option NAME VALUE)
 
   set(${NAME} ${VALUE} CACHE STRING "")
 endmacro()
+
+# Determine the build preset and load it.
+macro(load_build_preset)
+  if(DEFINED EXECUTORCH_BUILD_PRESET_FILE)
+    announce_configured_options(EXECUTORCH_BUILD_PRESET_FILE)
+    message(STATUS "Loading build preset: ${EXECUTORCH_BUILD_PRESET_FILE}")
+    include(${EXECUTORCH_BUILD_PRESET_FILE})
+  endif()
+  # For now, just continue if the preset file is not set. In the future, we will
+  # try to determine a preset file.
+endmacro() From 5ad676d14b54c40ce40946588e8036d4125b0a2a Mon Sep 17 00:00:00 2001 From: mcremon-meta <134334895+mcremon-meta@users.noreply.github.com> Date: Thu, 8 May 2025 15:04:17 -0700 Subject: [PATCH 016/178] Extract trace from prepare_and_convert and remove export_program Differential Revision: D73440517 Pull Request resolved: https://github.com/pytorch/executorch/pull/10493 --- backends/cadence/aot/compiler.py | 97 +++++++++++-------- backends/cadence/aot/export_example.py | 6 +- .../aot/tests/test_remove_ops_passes.py | 68 +++++++------ 3 files changed, 97 insertions(+), 74 deletions(-) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 3d43ca2956e..594c4189b3a 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -39,7 +39,6 @@ from torch._inductor.decomposition import remove_decompositions from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e -from torch.export import export from torch.export.exported_program import ExportedProgram from .passes import get_cadence_passes @@ -55,27 +54,24 @@ # however useful for unit tests to separate the converted model from the fused # model, to be able to get reference numerics. # If this does not apply, please use quantize_and_fuse_pt2 instead. -def prepare_and_convert_pt2( +def trace( model: torch.nn.Module, inputs: tuple[object, ...], - quantizer: CadenceQuantizer, - calibration_data: Optional[list[tuple[object, ...]]] = None, dump_graphs: bool = False, -) -> torch.fx.GraphModule: +) -> ExportedProgram: """ - Prepare and convert a model using the given quantizer. - The quantizer must be supplied and be the same as the one used to - fuse the model later, if applicable. If you do not expect that behavior, - please use quantize_and_fuse_pt2 instead, which will instantiate a - default quantizer for you if needed. - If calibration data is provided, it will be used to calibrate the model. If - not, the inputs will be used for calibration instead, which is useful for - unit tests but should not be used for end-to-end use cases. - Returns a GraphModule with the converted model. + Trace the model with export_for_training and return an ExportedProgram. 
""" + # Make the model inference mode by calling model.eval() + model.eval() + + # Prevent mkldnn decompositions + torch._C._set_mkldnn_enabled(False) + # Get default decompositions decomp_table = torch.export.default_decompositions() + # Select ops to keep ops_to_keep = [ torch.ops.aten.conv1d.default, @@ -85,19 +81,46 @@ def prepare_and_convert_pt2( torch.ops.aten.matmul.default, torch.ops.aten.rms_norm.default, ] + # Remove decompositions for the ops we want to keep # pyre-fixme[6]: For 1st argument expected `Dict[typing.Callable[..., typing.Any remove_decompositions(decomp_table, ops_to_keep) + # Export with dynamo - model_gm = ( - torch.export.export_for_training(model, inputs, strict=True) - .run_decompositions(decomp_table) - .module() - ) + program = torch.export.export_for_training( + model, inputs, strict=True + ).run_decompositions(decomp_table) if dump_graphs: logging.info("Graph before quantization:") - logging.info(model_gm.graph.print_tabular()) + logging.info(program.module().graph.print_tabular()) + + return program + + +def prepare_and_convert_pt2( + program: ExportedProgram, + inputs: tuple[object, ...], + quantizer: CadenceQuantizer, + calibration_data: Optional[list[tuple[object, ...]]] = None, + dump_graphs: bool = False, +) -> torch.fx.GraphModule: + """ + Prepare and convert a model using the given quantizer. + The quantizer must be supplied and be the same as the one used to + fuse the model later, if applicable. If you do not expect that behavior, + please use quantize_and_fuse_pt2 instead, which will instantiate a + default quantizer for you if needed. + If calibration data is provided, it will be used to calibrate the model. If + not, the inputs will be used for calibration instead, which is useful for + unit tests but should not be used for end-to-end use cases. + Returns a GraphModule with the converted model. + """ + + # Get the graph module from the ExportedProgram + model_gm = program.module() + + assert isinstance(model_gm, torch.fx.GraphModule) # Prepare prepared_model = prepare_pt2e(model_gm, quantizer) @@ -121,10 +144,10 @@ def prepare_and_convert_pt2( # Note: this is not meant as a primary API since it can create inconsistencies -# if the quantizer here is different from the quantizer used to convert. It is -# however useful for unit tests to separate the converted model from the fused -# model, to be able to get reference numerics. -# If this does not apply, please use quantize_and_fuse_pt2 instead. +# if the quantizer here is different from the quantizer used to prepare/convert. +# It is however useful for unit tests to separate the converted model from the +# fused model, to be able to get reference numerics. +# If this does not apply, please use quantize_pt2 instead. 
def fuse_pt2( converted_graph_module: torch.fx.GraphModule, quantizer: CadenceQuantizer, @@ -167,9 +190,15 @@ def quantize_pt2( if not quantizer: quantizer = CadenceDefaultQuantizer() + program = trace(model, inputs, dump_graphs=dump_graphs) + + if dump_graphs: + logging.info("Graph after trace:") + logging.info(program.graph.print_tabular()) + # Get converted graph module converted_gm = prepare_and_convert_pt2( - model, inputs, quantizer, calibration_data, dump_graphs=dump_graphs + program, inputs, quantizer, calibration_data, dump_graphs=dump_graphs ) # Get fused model @@ -184,22 +213,6 @@ def quantize_pt2( return program -# Export the model and lower it to an ExportedProgram (in aten IR) -def export_program( - model: torch.nn.Module, - inputs: tuple[object, ...], -) -> ExportedProgram: - assert isinstance(model, torch.nn.Module), "model should be an nn.Module" - - # Prevent mkldnn decompositions - torch._C._set_mkldnn_enabled(False) - - # Export the model and return it. - expo_program = export(model, inputs, strict=True) - - return expo_program - - def _lower_ep_to_edge( expo_program: ExportedProgram, dump_graphs: bool = False, @@ -248,7 +261,7 @@ def export_to_edge( assert isinstance(model, torch.nn.Module), "model should be an nn.Module" # Export the model into an ExportedProgram. - expo_program = export_program(model, inputs) + expo_program = trace(model, inputs) # Lower the model to edge IR. edge_prog_manager = _lower_ep_to_edge(expo_program, dump_graphs, constant_methods) diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index d2148870e53..6eaead7105e 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -18,6 +18,7 @@ export_to_executorch_gen_etrecord, fuse_pt2, prepare_and_convert_pt2, + trace, ) from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer @@ -48,8 +49,11 @@ def export_model( # Instantiate the quantizer quantizer = CadenceDefaultQuantizer() + # Trace the model + ep = trace(model, example_inputs) + # Convert the model - converted_model = prepare_and_convert_pt2(model, example_inputs, quantizer) + converted_model = prepare_and_convert_pt2(ep, example_inputs, quantizer) # Get reference outputs from converted model ref_outputs = converted_model(*example_inputs) diff --git a/backends/cadence/aot/tests/test_remove_ops_passes.py b/backends/cadence/aot/tests/test_remove_ops_passes.py index 8caba7799b5..74c39ae3ee3 100644 --- a/backends/cadence/aot/tests/test_remove_ops_passes.py +++ b/backends/cadence/aot/tests/test_remove_ops_passes.py @@ -16,10 +16,10 @@ import torch.nn.functional as F from executorch.backends.cadence.aot import compiler from executorch.backends.cadence.aot.compiler import export_to_edge +from executorch.backends.cadence.aot.fuse_ops import FuseQuantDequantToRequantizePass from executorch.backends.cadence.aot.graph_builder import GraphBuilder from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match -from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer from executorch.backends.cadence.aot.remove_ops import ( RemoveAliasCopyOpPass, RemoveBranchedQuantDequant, @@ -42,9 +42,6 @@ from parameterized.parameterized import parameterized from pyre_extensions import none_throws -from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e - -from torch.export import export_for_training from torch.fx.passes.infra.pass_base import PassResult @@ -459,44 +456,53 @@ def 
forward(self, x, y): ) def test_remove_nop_quant_dequant(self): - class M(torch.nn.Module): - def __init__(self): - super(M, self).__init__() - self.linear = torch.nn.Linear(6, 12, bias=False) + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(8, 8)) + q0 = builder.call_operator( + op=exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.01662161760032177, -4, -128, 127, torch.int8), + ) + dq0 = builder.call_operator( + op=exir_ops.edge.cadence.dequantize_per_tensor.default, + args=(q0, 0.01662161760032177, -4, -128, 127, torch.int8), + ) + q1 = builder.call_operator( + op=exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.012577153742313385, -9, -128, 127, torch.int8), + ) + builder.output([dq0, q1]) + graph_module = builder.get_graph_module() - def forward(self, x): - x = self.linear(x) - return x + # Expect the dq op to be removed by the pass + self.assertEqual( + count_node( + graph_module, exir_ops.edge.cadence.dequantize_per_tensor.default + ), + 1, + ) - inp = torch.randn(2, 8, 1, 6) + # Expect 1 quantize op left since it has no matching dequant + self.assertEqual( + count_node(graph_module, exir_ops.edge.cadence.quantize_per_tensor.default), + 2, + ) - # Run the standard quant/convert steps, but without fusing - # this leaves two redundant quant/dequant pairs to test with - quantizer = CadenceDefaultQuantizer() - model_exp = export_for_training(M(), (inp,), strict=True).module() - prepared_model = prepare_pt2e(model_exp, quantizer) - prepared_model(inp) - converted_model = convert_pt2e(prepared_model) + p = FuseQuantDequantToRequantizePass() - graph_module = ( - compiler.export_to_cadence( - converted_model, - (inp,), - ) - .exported_program() - .graph_module - ) + graph_after_passes = cast(PassResult, p(graph_module)).graph_module - # Expect all quantize ops to be removed by the pass + # Expect the dq op to be removed by the pass self.assertEqual( - count_node(graph_module, exir_ops.edge.cadence.quantize_per_tensor.default), + count_node( + graph_after_passes, exir_ops.edge.cadence.dequantize_per_tensor.default + ), 0, ) - # Expect 1 dequantize op for the weights + # Expect 1 quantize op left since it has no matching dequant self.assertEqual( count_node( - graph_module, exir_ops.edge.cadence.dequantize_per_tensor.default + graph_after_passes, exir_ops.edge.cadence.quantize_per_tensor.default ), 1, ) From 277c39d869e18c70eb85d33025fa554d368caa8a Mon Sep 17 00:00:00 2001 From: Thomas Jannaud Date: Thu, 8 May 2025 16:15:03 -0700 Subject: [PATCH 017/178] Make constant_folding's _DEFAULT_SKIP_TARGETS public Differential Revision: D74349918 Pull Request resolved: https://github.com/pytorch/executorch/pull/10760 --- exir/passes/constant_prop_pass.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 0049e597f8d..6921bd632f4 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -29,7 +29,8 @@ # Avoid propagating constants for `exir.ops.edge.aten.full.default`. # Propagating aten.full can significantly increase compiled model size. 
-_DEFAULT_SKIP_TARGETS = {exir_ops.edge.aten.full.default}
+_DEFAULT_SKIP_TARGETS_NO_QUANT = {exir_ops.edge.aten.full.default}
+_DEFAULT_SKIP_TARGETS = set(_DEFAULT_SKIP_TARGETS_NO_QUANT)

 # Do not const prop quantization primitives
 _QUANT_PRIMITIVES_EDGE = [aten_to_edge(op) for op in _QUANT_PRIMITIVES]
@@ -48,6 +49,10 @@
 )


+def get_default_skip_targets_no_quant() -> set[EdgeOpOverload]:
+ return _DEFAULT_SKIP_TARGETS_NO_QUANT
+
+
 def is_const(
 arg,
 exported_program: ExportedProgram,

From b1b46ee4c5615f5f2008cc05ee1e32367b3e43bd Mon Sep 17 00:00:00 2001
From: Thomas Jannaud
Date: Thu, 8 May 2025 17:04:14 -0700
Subject: [PATCH 018/178] : constant fold None

Differential Revision: D74350331

Pull Request resolved: https://github.com/pytorch/executorch/pull/10762
---
 exir/passes/constant_prop_pass.py | 2 ++
 exir/tests/test_passes.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
index 6921bd632f4..a103568b9a9 100644
--- a/exir/passes/constant_prop_pass.py
+++ b/exir/passes/constant_prop_pass.py
@@ -66,6 +66,8 @@ def is_const(
 )
 elif isinstance(arg, _PRIMITIVE_TYPES):
 return True
+ elif arg is None:
+ return True
 elif not isinstance(arg, torch.fx.Node):
 return False
 elif arg in const_node_to_tensor:
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index d3c2d0a0936..6618c729987 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -1823,3 +1823,34 @@ def _do_checks(
 self.assertTrue(
 torch.allclose(output_no_dim_order[0], output_no_dim_order_revert[0])
 )
+
+ def test_constant_prop_pass_none(self) -> None:
+ """
+ This checks that None arguments are treated as constants in constant_prop_pass.
+ """
+
+ class M(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.cst = torch.ones(3, 3, 3, dtype=torch.int8)
+ self.w = torch.ones(3, 3, 3, dtype=torch.int8)
+
+ def forward(self, x):
+ # Note: using e.g aten.linear would not work as None is not in the graph
+ a = torch.ops.aten.convolution.default(
+ self.cst, self.w, None, [1], [0], [1], False, [0], 1
+ )
+ return a + x
+
+ mod = M()
+ x = torch.randn([3, 3, 3])
+ mod(x)
+ edge = to_edge(
+ export(mod, (x,), strict=True),
+ compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
+ )
+ # 2 constants: self.w and self.cst
+ self.assertEqual(2, len(edge.exported_program().constants))
+ pass_result = constant_prop_pass(edge.exported_program())
+ # 1 constant: a (= self.w @ self.cst)
+ self.assertEqual(1, len(pass_result.constants))

From 6e3cb79b0120a5d44aef2379b59d35842634a3bc Mon Sep 17 00:00:00 2001
From: wl1026sun
Date: Thu, 8 May 2025 23:23:06 -0700
Subject: [PATCH 019/178] Make the TIE quantized conv operator fall back to
 the HiFi quantized conv op instead of the CPU op for shapes not supported by
 the TIE kernel.
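A sketch of the intended call pattern (illustrative only: the wrapper and the
tie_shape_supported() check below are hypothetical, and only the HiFi
declarations added to operators.h in this patch are real):

    // Inside a TIE conv wrapper: for shapes the TIE kernel cannot handle,
    // dispatch to the HiFi kernel instead of the portable CPU op.
    if (!tie_shape_supported(input, weight, stride, padding)) { // hypothetical
      ::impl::HiFi::native::quantized_conv_per_tensor_out(
          ctx, input, weight, bias, stride, padding, dilation, groups,
          in_zero_point, weight_zero_point, bias_scale, output_scale,
          output_zero_point, out_multiplier, out_shift, channel_last, out);
      return;
    }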
Differential Revision: D74337713

Pull Request resolved: https://github.com/pytorch/executorch/pull/10770
---
 backends/cadence/hifi/operators/operators.h | 39 +++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h
index e9cfca6fb70..105510e3421 100644
--- a/backends/cadence/hifi/operators/operators.h
+++ b/backends/cadence/hifi/operators/operators.h
@@ -12,6 +12,7 @@
 _(uint8_t, Byte) \
 _(int8_t, Char)

+using ::executorch::aten::IntArrayRef;
 using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
@@ -67,6 +68,44 @@ void quantized_linear_per_tensor_out(
 __ET_UNUSED const optional& offset,
 Tensor& out);

+void quantized_conv_out(
+ __ET_UNUSED KernelRuntimeContext& ctx,
+ const Tensor& input,
+ const Tensor& weight,
+ const Tensor& bias,
+ IntArrayRef stride,
+ IntArrayRef padding,
+ IntArrayRef dilation,
+ int64_t groups,
+ int64_t in_zero_point,
+ const Tensor& weight_zero_point,
+ const Tensor& bias_scale,
+ double output_scale,
+ int64_t output_zero_point,
+ __ET_UNUSED const Tensor& out_multiplier,
+ __ET_UNUSED const Tensor& out_shift,
+ bool channel_last,
+ Tensor& out);
+
+void quantized_conv_per_tensor_out(
+ __ET_UNUSED KernelRuntimeContext& ctx,
+ const Tensor& input,
+ const Tensor& weight,
+ const Tensor& bias,
+ IntArrayRef stride,
+ IntArrayRef padding,
+ IntArrayRef dilation,
+ int64_t groups,
+ int64_t in_zero_point,
+ int64_t weight_zero_point,
+ double bias_scale,
+ double output_scale,
+ int64_t output_zero_point,
+ __ET_UNUSED int64_t out_multiplier,
+ __ET_UNUSED int64_t out_shift,
+ bool channel_last,
+ Tensor& out);
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

From 01a5d818ccee61e734c7c1e42bbc8abba24a8395 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Fri, 9 May 2025 01:34:47 -0500
Subject: [PATCH 020/178] Arm Backend: Use tosa_ref_model only if it is
 available

Differential Revision: D74420616

Pull Request resolved: https://github.com/pytorch/executorch/pull/10778
---
 backends/arm/test/conftest.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py
index 2d247f7bd42..936a4f64a10 100644
--- a/backends/arm/test/conftest.py
+++ b/backends/arm/test/conftest.py
@@ -44,10 +44,20 @@ def pytest_configure(config):
 if getattr(config.option, "fast_fvp", False):
 pytest._test_options["fast_fvp"] = config.option.fast_fvp # type: ignore[attr-defined]

+ pytest._test_options["tosa_version"] = "0.80" # type: ignore[attr-defined]
 if config.option.arm_run_tosa_version:
 pytest._test_options["tosa_version"] = config.option.arm_run_tosa_version
-
 pytest._test_options["tosa_ref_model"] = True # type: ignore[attr-defined]
+ # Not all deployments of ET have the TOSA reference model available.
+ # Make sure we don't try to use it if it's not available.
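 # The guarded import below degrades gracefully: if neither package can be
 # imported, pytest._test_options["tosa_ref_model"] is forced to False, so
 # reference-model runs are disabled rather than erroring at import time.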
+ try:
+ if pytest._test_options["tosa_version"] == "0.80":
+ import tosa_tools.v0_80.tosa_reference_model as tosa_reference_model
+ else:
+ import tosa_tools.tosa_ref_model as tosa_reference_model
+ except ImportError:
+ pytest._test_options["tosa_ref_model"] = False # type: ignore[attr-defined]
+ tosa_reference_model = None # noqa

 logging.basicConfig(level=logging.INFO, stream=sys.stdout)

From 7e1f3e3680f7e33dd9d9a949e3940ee7875745d4 Mon Sep 17 00:00:00 2001
From: lucylq
Date: Fri, 9 May 2025 00:10:57 -0700
Subject: [PATCH 021/178] Use std::aligned_alloc in file_data_loader

Differential Revision: D74041198

Pull Request resolved: https://github.com/pytorch/executorch/pull/10660
---
 .github/workflows/pull.yml | 4 +-
 extension/data_loader/file_data_loader.cpp | 69 +++-----------
 .../test/backend_integration_test.cpp | 4 +-
 runtime/platform/compiler.h | 95 +++++++++++++++++++
 4 files changed, 110 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 795272688bd..2dc1fcde36e 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -434,9 +434,7 @@ jobs:
 output=$(ls -la cmake-out/test/size_test)
 arr=($output)
 size=${arr[4]}
- # threshold=48120 on devserver with gcc11.4
- # todo(lfq): update once binary size is below 50kb.
- threshold="47552"
+ threshold="47560"
 if [[ "$size" -le "$threshold" ]]; then
 echo "Success $size <= $threshold"
 else
diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index 503539774a5..e9922eb8323 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -49,20 +49,6 @@ namespace {
 static bool is_power_of_2(size_t value) {
 return value > 0 && (value & ~(value - 1)) == value;
 }
-
-/**
- * Returns the next alignment for a given pointer.
- */
-static uint8_t* align_pointer(void* ptr, size_t alignment) {
- intptr_t addr = reinterpret_cast(ptr);
- if ((addr & (alignment - 1)) == 0) {
- // Already aligned.
- return reinterpret_cast(ptr);
- }
- // Bump forward.
- addr = (addr | (alignment - 1)) + 1;
- return reinterpret_cast(addr);
-}
 } // namespace

 FileDataLoader::~FileDataLoader() {
@@ -129,13 +115,13 @@ namespace {
 /**
 * FreeableBuffer::FreeFn-compatible callback.
 *
- * `context` is actually a ptrdiff_t value (not a pointer) that contains the
- * offset in bytes between `data` and the actual pointer to free.
+ * `context` is the original buffer pointer. It is allocated with
+ * ET_ALIGNED_ALLOC, and must be freed with ET_ALIGNED_FREE.
+ *
+ * `data` and `size` are unused.
 */
 void FreeSegment(void* context, void* data, ET_UNUSED size_t size) {
- ptrdiff_t offset = reinterpret_cast(context);
- ET_DCHECK_MSG(offset >= 0, "Unexpected offset %ld", (long int)offset);
- std::free(static_cast(data) - offset);
+ ET_ALIGNED_FREE(context);
 }
 } // namespace

@@ -163,57 +149,26 @@ Result FileDataLoader::load(
 }

 // Allocate memory for the FreeableBuffer.
- size_t alloc_size = size;
- if (alignment_ > alignof(std::max_align_t)) {
- // malloc() will align to smaller values, but we must manually align to
- // larger values.
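 // (Context for the removal here: malloc() only guarantees
 //  alignof(std::max_align_t), so the old path over-allocated
 //  `size + alignment` bytes and bumped the pointer up, which in turn forced
 //  FreeSegment() to carry the offset back to the original allocation.)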
- alloc_size += alignment_; - } - void* buffer = std::malloc(alloc_size); - if (buffer == nullptr) { + void* aligned_buffer = ET_ALIGNED_ALLOC(alignment_, size); + if (aligned_buffer == nullptr) { ET_LOG( Error, - "Reading from %s at offset %zu: malloc(%zd) failed", + "Reading from %s at offset %zu: ET_ALIGNED_ALLOC(%zd, %zd) failed", file_name_, offset, + alignment_, size); return Error::MemoryAllocationFailed; } - // Align. - void* aligned_buffer = align_pointer(buffer, alignment_); - - // Assert that the alignment didn't overflow the buffer. - ET_DCHECK_MSG( - reinterpret_cast(aligned_buffer) + size <= - reinterpret_cast(buffer) + alloc_size, - "aligned_buffer %p + size %zu > buffer %p + alloc_size %zu", - aligned_buffer, - size, - buffer, - alloc_size); - auto err = load_into(offset, size, segment_info, aligned_buffer); if (err != Error::Ok) { - // Free `buffer`, which is what malloc() gave us, not `aligned_buffer`. - std::free(buffer); + ET_ALIGNED_FREE(aligned_buffer); return err; } - // We can't naively free this pointer, since it may not be what malloc() gave - // us. Pass the offset to the real buffer as context. This is the number of - // bytes that need to be subtracted from the FreeableBuffer::data() pointer to - // find the actual pointer to free. - return FreeableBuffer( - aligned_buffer, - size, - FreeSegment, - /*free_fn_context=*/ - reinterpret_cast( - // Using signed types here because it will produce a signed ptrdiff_t - // value, though for us it will always be non-negative. - reinterpret_cast(aligned_buffer) - - reinterpret_cast(buffer))); + // Pass the aligned_buffer pointer as context to FreeSegment. + return FreeableBuffer(aligned_buffer, size, FreeSegment, aligned_buffer); } Result FileDataLoader::size() const { diff --git a/runtime/executor/test/backend_integration_test.cpp b/runtime/executor/test/backend_integration_test.cpp index ea9467907c7..e2e61f171eb 100644 --- a/runtime/executor/test/backend_integration_test.cpp +++ b/runtime/executor/test/backend_integration_test.cpp @@ -656,8 +656,8 @@ class DelegateDataAlignmentTest : public ::testing::TestWithParam { // The delegate data inline alignment used by the -da1024 file. return 1024; } else { - // A small alignment that's compatible with any realistic alignment. - return 4; + // Minimum alignment expected by program.cpp. + return alignof(std::max_align_t); } } diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index 7467d5c1e04..da7e0988a62 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -171,6 +171,101 @@ using ssize_t = ptrdiff_t; #endif +/** + * Platform-specific aligned memory allocation and deallocation. + * + * Usage: + * void* ptr = ET_ALIGNED_ALLOC(alignment, size); + * // use ptr... + * ET_ALIGNED_FREE(ptr); + * + * Note: alignment must be a power of 2 and size must be an integral multiple of + * alignment. + */ +#if defined(_MSC_VER) +#include +#define ET_ALIGNED_ALLOC(alignment, size) \ + _aligned_malloc(((size + alignment - 1) & ~(alignment - 1)), (alignment)) +#define ET_ALIGNED_FREE(ptr) _aligned_free(ptr) +#elif defined(__APPLE__) +#include // For posix_memalign and free +inline void* et_apple_aligned_alloc(size_t alignment, size_t size) { + void* ptr = nullptr; + // The address of the allocated memory must be a multiple of sizeof(void*). 
+ if (alignment < sizeof(void*)) { + alignment = sizeof(void*); + } + if (posix_memalign( + &ptr, alignment, (size + alignment - 1) & ~(alignment - 1)) != 0) { + return nullptr; + } + return ptr; +} +#define ET_ALIGNED_ALLOC(alignment, size) \ + et_apple_aligned_alloc((alignment), (size)) +#define ET_ALIGNED_FREE(ptr) free(ptr) +#elif __has_builtin(__builtin_aligned_alloc) || defined(_ISOC11_SOURCE) +// Linux and posix systems that support aligned_alloc and are >= C++17. +#include +#define ET_ALIGNED_ALLOC(alignment, size) \ + ::aligned_alloc(alignment, (size + alignment - 1) & ~(alignment - 1)) +#define ET_ALIGNED_FREE(ptr) free(ptr) +#else +// If the platform doesn't support aligned_alloc, fallback to malloc. +#include +#include +inline void* et_aligned_malloc(size_t alignment, size_t size) { + // Place to store the offset to the original pointer. + size_t offset_size = sizeof(uint16_t); + + // Malloc extra space for offset + alignment. + size_t alloc_size = size + offset_size + alignment - 1; + void* ptr = std::malloc(alloc_size); + + if (ptr == nullptr) { + // Malloc failed. + return nullptr; + } + + uintptr_t addr = reinterpret_cast(ptr); + // Align the address past addr + offset_size bytes. + // This provides space to store the offset before the aligned pointer. + addr = addr + offset_size; + uintptr_t aligned_ptr = (addr + alignment - 1) & ~(alignment - 1); + + // Check that alignment didn't overflow the buffer. + if (reinterpret_cast(aligned_ptr) + size > + reinterpret_cast(ptr) + alloc_size) { + std::free(ptr); + return nullptr; + } + + // Store the offset to the original pointer. + // Used to free the original allocated buffer. + *(reinterpret_cast(aligned_ptr) - 1) = + (uint16_t)(reinterpret_cast(aligned_ptr) - + reinterpret_cast(ptr)); + + return reinterpret_cast(aligned_ptr); +} + +inline void et_aligned_free(void* ptr) { + if (ptr == nullptr) { + return; + } + + // Get the original pointer using the offset. + uint16_t* original_ptr = reinterpret_cast( + reinterpret_cast(ptr) - + *(reinterpret_cast(ptr) - 1)); + std::free(original_ptr); +} + +#define ET_ALIGNED_ALLOC(alignment, size) et_aligned_malloc((alignment), (size)) +#define ET_ALIGNED_FREE(ptr) et_aligned_free(ptr) + +#endif + // DEPRECATED: Use the non-underscore-prefixed versions instead. // TODO(T199005537): Remove these once all users have stopped using them. #define __ET_DEPRECATED ET_DEPRECATED From 6759d35d49d988430dbab4cb22f320a0b0f21d1a Mon Sep 17 00:00:00 2001 From: Thomas Jannaud Date: Fri, 9 May 2025 00:45:45 -0700 Subject: [PATCH 022/178] fix transpose / permutations fusion pass Differential Revision: D74447383 Pull Request resolved: https://github.com/pytorch/executorch/pull/10780 --- backends/cadence/aot/fuse_ops.py | 13 +++++- .../aot/tests/test_fusion_ops_passes.py | 44 +++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py index 77184c7af77..7a20a3f64b4 100644 --- a/backends/cadence/aot/fuse_ops.py +++ b/backends/cadence/aot/fuse_ops.py @@ -885,6 +885,9 @@ class FuseTransposeOrPermuteOpPairsPass(FuseOpPairsAcrossBranchesPass): """ Fuse transpose or permute op pairs to a single view op. (transpose or permutation) -> (quant or dequant) -> (transpose or permutation) + This happens when op2(op1) == identity, modulo unitary dimensions. 
+ 'unitary dimensions' example: a tensor of shape [1, 5, 30] is equivalent (in memory) to [5, 1, 30]
+ so transpose(1, 2) then transpose(0, 2) is a pseudo identity and should be fused.
 """

 # A list of ops that can be bypassed when looking for a
@@ -908,7 +911,7 @@ def can_fuse_for_chain(
 if not super().can_fuse_for_chain(producer, consumer, consumer_op_packets):
 return False

- # checking that permut2(permut1(identify)) == identity
+ # checking that permut2(permut1(identity)) == identity, modulo unitary dimensions
 input_shape = cast(torch.fx.Node, producer.args[0]).meta["val"].shape
 ident_dims = list(range(len(input_shape)))
 # this mapping helps to handle both transpose and permutations
@@ -918,7 +921,10 @@ def can_fuse_for_chain(
 }
 in_dims = f[producer.target](producer, ident_dims)
 out_dims = f[consumer.target](consumer, in_dims)
- return out_dims == ident_dims
+ # Filtering out unitary dimensions
+ non_unit_ident_dims = [dim for dim in ident_dims if input_shape[dim] != 1]
+ non_unit_out_dims = [dim for dim in out_dims if input_shape[dim] != 1]
+ return non_unit_out_dims == non_unit_ident_dims

 def get_fused_node(
 self,
@@ -926,6 +932,9 @@ def get_fused_node(
 consumer: torch.fx.Node,
 graph_module: torch.fx.GraphModule,
 ) -> torch.fx.Node:
+ # This step is important because we can fuse transpositions that are not perfectly
+ # the reverse of one another but are fused when there are unitary dimensions.
+ # The fused operation must have the same output shape as the consumer.
 output_shape = consumer.meta["val"].shape
 with graph_module.graph.inserting_after(consumer):
 view = graph_module.graph.call_function(
diff --git a/backends/cadence/aot/tests/test_fusion_ops_passes.py b/backends/cadence/aot/tests/test_fusion_ops_passes.py
index 1bb44b872d2..4e267254488 100644
--- a/backends/cadence/aot/tests/test_fusion_ops_passes.py
+++ b/backends/cadence/aot/tests/test_fusion_ops_passes.py
@@ -584,6 +584,28 @@ def _create_operator(
 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
 False,
 ),
+ # transpose -> quant -> transpose is not the reverse BUT there is a UNITARY dimension
+ # so it ends up being the same in memory => fuse
+ (
+ True,
+ [0, 1],
+ True,
+ [0, 2],
+ exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+ True,
+ [5, 40, 1],
+ ),
+ # transpose -> quant -> transpose is not the reverse, and unitary dimensions
+ # don't help => don't fuse
+ (
+ True,
+ [0, 1],
+ True,
+ [1, 3],
+ exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+ False,
+ [5, 40, 1, 4],
+ ),
 # permutation -> quant -> opposite permutation => fuse
 (
 False,
@@ -622,6 +644,28 @@ def _create_operator(
 False,
 [4, 4, 4],
 ),
+ # permutation -> quant -> a non-reverse permutation BUT there is a UNITARY dimension
+ # so it ends up being the same in memory => fuse
+ (
+ False,
+ [1, 3, 2, 0],
+ False,
+ [3, 2, 1, 0],
+ exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+ True,
+ [3, 1, 8, 10],
+ ),
+ # permutation -> quant -> a non-reverse permutation, and unitary dimensions
+ # don't help => don't fuse
+ (
+ False,
+ [1, 3, 2, 0],
+ False,
+ [3, 1, 2, 0],
+ exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+ False,
+ [3, 1, 8, 10],
+ ),
 # transpose -> quant -> transpose as a permutation => fuse
 (
 True,

From 54a14d9bbdf1c229e44db23b4eb44bf1b64513cc Mon Sep 17 00:00:00 2001
From: per held
Date: Fri, 9 May 2025 13:10:42 +0200
Subject: [PATCH 023/178] Arm backend: Suppress colors in pre-push if
 non-interactive (#10783)

Don't try to print with colors in the pre-push
script if the script is non-interactive. This
is to avoid getting broken output in the CI
which doesn't support colors.

Signed-off-by: per.held@arm.com
---
 backends/arm/scripts/pre-push | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push
index b755f2bcc48..804abbe26a4 100755
--- a/backends/arm/scripts/pre-push
+++ b/backends/arm/scripts/pre-push
@@ -8,11 +8,13 @@
 # non-interactive mode. "$#" gives the number of positional arguments.
 [ "$#" -eq 0 ] && is_script_interactive=1 || is_script_interactive=0

-RESET='\e[0m'
-RED='\e[31m'
-GREEN='\e[32m'
-YELLOW='\e[33m'
-BLUE='\e[34m'
+if [ $is_script_interactive -eq 1 ]; then
+ RESET='\e[0m'
+ RED='\e[31m'
+ GREEN='\e[32m'
+ YELLOW='\e[33m'
+ BLUE='\e[34m'
+fi

 INFO="${BLUE}[INFO]${RESET}"
 WARNING="${YELLOW}[WARNING]${RESET}"

From f7c906f6158d546c84495ca308806e6944cb9ea5 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Fri, 9 May 2025 11:18:03 -0500
Subject: [PATCH 024/178] Cortex-M: Use q/dq ops in Arm Ethos Runner (#10782)

---
 CMakeLists.txt                                |   6 +
 backends/arm/scripts/build_executorch.sh      |   1 +
 backends/arm/test/test_arm_baremetal.sh       |   7 ++
 backends/cortex_m/CMakeLists.txt              |  61 ++++++++++
 .../cortex_m/ops/op_dequantize_per_tensor.cpp |  93 +++++++++++----
 .../cortex_m/ops/op_quantize_per_tensor.cpp   | 111 +++++++++++++++---
 examples/arm/aot_arm_compiler.py              |  85 +++++++++++++-
 examples/arm/executor_runner/CMakeLists.txt   |  22 +++-
 examples/arm/run.sh                           |  20 +++-
 9 files changed, 360 insertions(+), 46 deletions(-)
 create mode 100644 backends/cortex_m/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76c75270d5f..4d292c209a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,6 +242,8 @@
 option(EXECUTORCH_USE_DL "Use libdl library" ON)

 option(EXECUTORCH_BUILD_CADENCE "Build the Cadence DSP backend" OFF)

+option(EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" OFF)
+
 #
 # pthreadpool: build pthreadpool library.
Disable on unsupported platforms # @@ -715,6 +717,10 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() +if(EXECUTORCH_BUILD_CORTEX_M) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) +endif() + if(EXECUTORCH_BUILD_DEVTOOLS) if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index 87d9fd23070..573f93221d4 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -129,6 +129,7 @@ cmake \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_CORTEX_M=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ ${build_devtools_flags} \ ${build_with_etdump_flags} \ diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 48cee9acd95..476d417a69a 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -154,6 +154,13 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul + + # Cortex-M op tests + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qadd --bundleio + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio --no_delegate --portable_kernels="aten::sub.out,aten::add.out,aten::mul.out" + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=qops --bundleio + echo "${TEST_SUITE_NAME}: PASS" } diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt new file mode 100644 index 00000000000..39638bf0ee4 --- /dev/null +++ b/backends/cortex_m/CMakeLists.txt @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Kernel library for Cortex-M operators. Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +# Cortex-M ops kernel sources +set(_cortex_m_kernels__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp +) + +# Generate C++ bindings to register kernels into Executorch (for runtime). 
+# Here select all ops in operators.yaml
+set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
+gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
+
+# Generate bindings for the kernels
+generate_bindings_for_kernels(
+ LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
+)
+message("Generated files ${gen_command_sources}")
+
+# Build a library for _cortex_m_kernels_srcs
+add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
+target_link_libraries(cortex_m_kernels PRIVATE executorch)
+target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
+
+# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
+gen_operators_lib(
+ LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
+)
+
+install(
+ TARGETS cortex_m_kernels cortex_m_ops_lib
+ DESTINATION lib
+ PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
+)
diff --git a/backends/cortex_m/ops/op_dequantize_per_tensor.cpp b/backends/cortex_m/ops/op_dequantize_per_tensor.cpp
index 1011de73be7..6d3f3698c67 100644
--- a/backends/cortex_m/ops/op_dequantize_per_tensor.cpp
+++ b/backends/cortex_m/ops/op_dequantize_per_tensor.cpp
@@ -29,6 +29,7 @@
 */
 void check_dequantize_args(
 const Tensor& input,
+ int64_t zero_point,
 int64_t quant_min,
 int64_t quant_max,
 ScalarType dtype,
@@ -39,6 +40,18 @@
 "input.scalar_type() %" PRId8 " is not char type",
 static_cast(input.scalar_type()));

+ // Check zp range
+ ET_CHECK_MSG(
+ zero_point >= quant_min,
+ "zero_point must be %" PRId64 " >= quant_min %" PRId64,
+ zero_point,
+ quant_min);
+ ET_CHECK_MSG(
+ zero_point <= quant_max,
+ "zero_point must be %" PRId64 " <= quant_max %" PRId64,
+ zero_point,
+ quant_max);
+
 // Check output dtype is float
 ET_CHECK_MSG(
 out.scalar_type() == ScalarType::Float,
@@ -73,18 +86,10 @@
 /**
 * Scalar implementation of quantization for a single value.
 */
-template
-T dequantize_val(
- float scale,
- int32_t zero_point,
- K value,
- int64_t quant_min,
- int64_t quant_max) {
- (void)quant_min;
- (void)quant_max;
- return static_cast((static_cast(value) - zero_point) * scale);
+template
+F dequantize_val(float scale, int32_t zero_point, Q qvalue) {
+ return static_cast((static_cast(qvalue) - zero_point) * scale);
 }
-
 } // namespace

 Tensor& dequantize_per_tensor_out(
@@ -106,29 +111,71 @@
 "Failed to resize out Tensor in dequantize_per_tensor_out");

 // Validate input parameters
- check_dequantize_args(input, quant_min, quant_max, dtype, out);
+ check_dequantize_args(input, zero_point, quant_min, quant_max, dtype, out);

- // Pre-compute inverse scale for better performance
 int32_t zp = static_cast(zero_point);
- int32_t qmin = static_cast(quant_min);
- int32_t qmax = static_cast(quant_max);

 // Get pointers to input and output data
 const int8_t* input_data = input.const_data_ptr();
 float* out_data = out.mutable_data_ptr();
 const size_t numel = input.numel();
+ size_t i = 0;

 #if defined(HAS_HELIUM_SIMD)
-// Helium MVE implementation for float32 to int8 quantization
-#Error "Implement MVE version!"
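// A note on the MVE path introduced below: the gather offsets
// {0x0, 0x8, 0x4, 0xC, ...} permute the 16 loaded bytes so that the two
// widening stages (vmovlb/vmovlt: int8 -> int16 -> int32) emit values in
// plain 0-3 / 4-7 / 8-11 / 12-15 order, letting the four float stores stay
// contiguous with no extra shuffle instructions.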
-#else - // Scalar implementation for float32 to int8 quantization - for (size_t i = 0; i < numel; i++) { - out_data[i] = - dequantize_val(scale, zp, input_data[i], qmin, qmax); + // Helium MVE implementation for int8 to float quantization + static uint8x16_t voffset{ + 0x0, + 0x8, + 0x4, + 0xC, + 0x1, + 0x9, + 0x5, + 0xD, + 0x2, + 0xA, + 0x6, + 0xE, + 0x3, + 0xB, + 0x7, + 0xF}; + + int16x8_t vzp = vdupq_n_s16(static_cast(zp)); + float32x4_t vscale = vdupq_n_f32(static_cast(scale)); + + for (; i + 15 < numel; i += 16) { + int8x16_t in_084C195D2A6E3B7F = + vldrbq_gather_offset_s8(input_data, voffset); + + int16x8_t in_04152637 = vsubq_s16(vmovlbq_s8(in_084C195D2A6E3B7F), vzp); + int16x8_t in_8C9DAEBF = vsubq_s16(vmovltq_s8(in_084C195D2A6E3B7F), vzp); + + float32x4_t inf_0123 = vcvtq_f32_s32(vmovlbq_s16(in_04152637)); + float32x4_t inf_4567 = vcvtq_f32_s32(vmovltq_s16(in_04152637)); + float32x4_t inf_89AB = vcvtq_f32_s32(vmovlbq_s16(in_8C9DAEBF)); + float32x4_t inf_CDEF = vcvtq_f32_s32(vmovltq_s16(in_8C9DAEBF)); + + float32x4_t out_0123 = vmulq_f32(inf_0123, vscale); + float32x4_t out_4567 = vmulq_f32(inf_4567, vscale); + float32x4_t out_89AB = vmulq_f32(inf_89AB, vscale); + float32x4_t out_CDEF = vmulq_f32(inf_CDEF, vscale); + + vstrwq_f32(out_data + 0, out_0123); + vstrwq_f32(out_data + 4, out_4567); + vstrwq_f32(out_data + 8, out_89AB); + vstrwq_f32(out_data + 12, out_CDEF); + + input_data += 16; + out_data += 16; } -#endif +#endif // defined(HAS_HELIUM_SIMD) + for (; i < numel; i++) { + *out_data = dequantize_val(scale, zp, *input_data); + *input_data++; + *out_data++; + } return out; } diff --git a/backends/cortex_m/ops/op_quantize_per_tensor.cpp b/backends/cortex_m/ops/op_quantize_per_tensor.cpp index 25385602e58..d92d2666a8f 100644 --- a/backends/cortex_m/ops/op_quantize_per_tensor.cpp +++ b/backends/cortex_m/ops/op_quantize_per_tensor.cpp @@ -41,13 +41,13 @@ void check_quantize_args( "input.scalar_type() %" PRId8 " is not float type", static_cast(input.scalar_type())); - // Check output dtype is int8 (Char) + // Check output dtype is int8 ET_CHECK_MSG( out.scalar_type() == ScalarType::Char, "out.scalar_type() %" PRId8 " is not int8 (Char)", static_cast(out.scalar_type())); - // Check dtype is int8 (Char) + // Check dtype is int8 ET_CHECK_MSG( dtype == ScalarType::Char, "dtype %" PRId8 " is not int8 (Char)", @@ -75,18 +75,18 @@ void check_quantize_args( /** * Scalar implementation of quantization for a single value. */ -template -T quantize_val( - float inv_scale, +template +Q quantize_val( + F inv_scale, int32_t zero_point, - K value, + F value, int64_t quant_min, int64_t quant_max) { int32_t qvalue = zero_point + static_cast(std::nearbyint(inv_scale * value)); qvalue = std::max(qvalue, static_cast(quant_min)); qvalue = std::min(qvalue, static_cast(quant_max)); - return static_cast(qvalue); + return static_cast(qvalue); } } // namespace @@ -123,16 +123,97 @@ Tensor& quantize_per_tensor_out( int8_t* out_data = out.mutable_data_ptr(); const size_t numel = input.numel(); + size_t i = 0; + #if defined(HAS_HELIUM_SIMD) -// Helium MVE implementation for float32 to int8 quantization -#Error "Implement MVE version!" 
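// A note on the "magic number" rounding used by the MVE path introduced
// below: for |f| < 2^22, the sum f + 12582912.0f (= 1.5 * 2^23) lands in
// [2^23, 2^24), where a float's ULP is exactly 1, so the addition itself
// rounds f to the nearest integer with ties to even. Reinterpreting the sum
// as int32 and subtracting bit_pattern(12582912.0f) == 0x4B400000 recovers
// round(f), and folding the zero point into the subtrahend (magic_int - zp)
// yields round(f) + zp in a single vsubq_s32. Worked example: f = 2.5 gives
// sum = 12582914.0f (ties to even), and bits - 0x4B400000 = 2.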
-#else
- // Scalar implementation for float32 to int8 quantization
- for (size_t i = 0; i < numel; i++) {
- out_data[i] =
- quantize_val(inv_scale, zp, input_data[i], qmin, qmax);
+ // Helium MVE implementation for float32 to int8 quantization
+ static uint8x16_t voffset{
+ 0x0,
+ 0x8,
+ 0x4,
+ 0xC,
+ 0x1,
+ 0x9,
+ 0x5,
+ 0xD,
+ 0x2,
+ 0xA,
+ 0x6,
+ 0xE,
+ 0x3,
+ 0xB,
+ 0x7,
+ 0xF};
+
+ float32x4_t inv_scale_vec = vdupq_n_f32(inv_scale);
+
+ // Magic number for float to int conversion, round to nearest even integer
+ // int magic_round(float f): interpret_as_int32(f + magic_float) - magic_int
+ // where,
+ // magic_float = 12582912.0f = (2 ** 23 + 2 ** 22) = (1.5 * 2 ** 23)
+ // magic_int = 1262485504 = 0x4B400000 = bit_pattern_as_int32(magic_float)
+
+ float magic_float = 12582912.0f;
+ int32_t magic_int = 1262485504;
+
+ float32x4_t vmagic_float = vdupq_n_f32(magic_float);
+ int32x4_t vmagic_int_less_zp =
+ vdupq_n_s32(magic_int - static_cast(zp));
+
+ int16x8_t vqmin = vdupq_n_s16(qmin);
+ int16x8_t vqmax = vdupq_n_s16(qmax);
+
+ // TODO: Measure performance, we are spilling
+ for (; i + 15 < numel; i += 16) {
+ float32x4_t in_0123 = vldrwq_f32(input_data + 0);
+ float32x4_t in_4567 = vldrwq_f32(input_data + 4);
+ float32x4_t in_89AB = vldrwq_f32(input_data + 8);
+ float32x4_t in_CDEF = vldrwq_f32(input_data + 12);
+
+ float32x4_t outf_0123 = vfmaq_f32(vmagic_float, in_0123, inv_scale_vec);
+ float32x4_t outf_4567 = vfmaq_f32(vmagic_float, in_4567, inv_scale_vec);
+ float32x4_t outf_89AB = vfmaq_f32(vmagic_float, in_89AB, inv_scale_vec);
+ float32x4_t outf_CDEF = vfmaq_f32(vmagic_float, in_CDEF, inv_scale_vec);
+
+ int32x4_t out_0123 =
+ vsubq_s32(vreinterpretq_s32_f32(outf_0123), vmagic_int_less_zp);
+ int32x4_t out_4567 =
+ vsubq_s32(vreinterpretq_s32_f32(outf_4567), vmagic_int_less_zp);
+ int32x4_t out_89AB =
+ vsubq_s32(vreinterpretq_s32_f32(outf_89AB), vmagic_int_less_zp);
+ int32x4_t out_CDEF =
+ vsubq_s32(vreinterpretq_s32_f32(outf_CDEF), vmagic_int_less_zp);
+
+ int16x8_t out_04152637;
+ int16x8_t out_8C9DAEBF;
+ out_04152637 = vmovnbq_s32(out_04152637, out_0123);
+ out_04152637 = vmovntq_s32(out_04152637, out_4567);
+ out_8C9DAEBF = vmovnbq_s32(out_8C9DAEBF, out_89AB);
+ out_8C9DAEBF = vmovntq_s32(out_8C9DAEBF, out_CDEF);
+
+ int16x8_t out_04152637_clamped =
+ vminq_s16(vmaxq_s16(out_04152637, vqmin), vqmax);
+ int16x8_t out_8C9DAEBF_clamped =
+ vminq_s16(vmaxq_s16(out_8C9DAEBF, vqmin), vqmax);
+
+ int8x16_t out_084C195D2A6E3B7F;
+ out_084C195D2A6E3B7F =
+ vmovnbq_s16(out_084C195D2A6E3B7F, out_04152637_clamped);
+ out_084C195D2A6E3B7F =
+ vmovntq_s16(out_084C195D2A6E3B7F, out_8C9DAEBF_clamped);
+
+ vstrbq_scatter_offset_s8(out_data, voffset, out_084C195D2A6E3B7F);
+ input_data += 16;
+ out_data += 16;
+ }
+#endif // defined(HAS_HELIUM_SIMD)
+
+ for (; i < numel; i++) {
+ *out_data =
+ quantize_val(inv_scale, zp, *input_data, qmin, qmax);
+ input_data++;
+ out_data++;
 }
-#endif

 return out;
 }
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 3d6acf2b94a..73fa4b24d4e 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -40,6 +40,11 @@
 )
 from executorch.backends.arm.vgf_partitioner import VgfPartitioner
+
+# To use Cortex-M backend
+from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
+ ReplaceQuantNodesPass,
+)
 from executorch.devtools.backend_debug import get_delegation_info
 from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
@@ -59,6 +64,7 @@
 from ..models import MODEL_NAME_TO_MODEL
 from ..models.model_factory import EagerModelFactory

+
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.WARNING, format=FORMAT)
@@ -216,6 +222,54 @@ def forward(self, x, y):
 can_delegate = True


+class QuantAddTest(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, a):
+ return a + a
+
+ example_input = (torch.rand([13, 3], dtype=torch.float32),) # a - normal values
+ can_delegate = True # when quantized
+
+
+class QuantAddTest2(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, a, b):
+ p = a + a
+ q = b + b
+ r = p + q
+ return p, q, r
+
+ example_input = (
+ torch.randn([13, 7, 3], dtype=torch.float32),
+ torch.randn([13, 7, 3], dtype=torch.float32),
+ )
+ can_delegate = True # when quantized
+
+
+class QuantOpTest(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, w, x, y, z):
+ o1 = w - x
+ o2 = o1 + y
+ o3 = o2 * z
+ return o1, o2, o3
+
+ example_input = (
+ torch.randn([3, 1, 2], dtype=torch.float32), # w - normal values
+ torch.randn([3, 5, 2], dtype=torch.float32), # x - normal values
+ torch.randn([3, 5, 1], dtype=torch.float32)
+ * -0.000001, # y - small -ve values, needed for calibration in tests
+ torch.randn([3, 5, 2], dtype=torch.float32) * 1000, # z - large values
+ )
+ can_delegate = True # when quantized
+
+
 class SoftmaxModule(torch.nn.Module):
 def __init__(self):
 super().__init__()
@@ -241,6 +295,9 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 "add": AddModule,
 "add2": AddModule2,
 "add3": AddModule3,
+ "qadd": QuantAddTest,
+ "qadd2": QuantAddTest2,
+ "qops": QuantOpTest,
 "softmax": SoftmaxModule,
 "MultipleOutputsModule": MultipleOutputsModule,
 }
@@ -255,6 +312,17 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 torch.randn(32, 5),
 torch.randn(32, 5),
 ),
+ "qadd": (torch.randn(32, 2, 1),),
+ "qadd2": (
+ torch.randn(32, 2, 1),
+ torch.randn(32, 2, 1),
+ ),
+ "qops": (
+ torch.randn(32, 2, 1),
+ torch.randn(32, 2, 1),
+ torch.randn(32, 2, 1) * -0.000001,
+ torch.randn(32, 2, 1) * 1000,
+ ),
 "softmax": (torch.randn(32, 2, 2),),
 }
@@ -656,7 +724,7 @@ def to_edge_TOSA_delegate(
 _check_ir_validity=False,
 ),
 )
+
 return model_int8, edge
@@ -681,9 +750,18 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_
 _check_ir_validity=False,
 ),
 )
+
 return model_int8, edge


+def transform_for_cortex_m_backend(edge):
+ # Let's make sure we are using the optimized Cortex-M backend
+ # NB: If we can't find and replace the ops that are expected to be replaced,
+ # bad things will happen at runtime, like "missing operator" errors!
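 # ReplaceQuantNodesPass is expected to rewrite the generic
 # quantized_decomposed quantize/dequantize_per_tensor nodes into their
 # cortex_m counterparts, which is what routes them to the kernels
 # registered by cortex_m_ops_lib at runtime.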
+ edge = edge.transform([ReplaceQuantNodesPass()])
+ return edge
+
+
 if __name__ == "__main__": # noqa: C901
 args = get_args()
@@ -715,6 +793,9 @@
 exported_program, args, model, example_inputs
 )

+ # Transform so we can use ops from the Cortex M backend
+ edge = transform_for_cortex_m_backend(edge)
+
 dump_delegation_info(edge, args.intermediates)

 try:
@@ -759,7 +840,9 @@
 output_name = os.path.join(args.output, output_name)

 if args.bundleio:
- save_bpte_program(exec_prog, original_model, output_name)
+ # Realize the quantization impact on numerics when generating reference output
+ reference_model = original_model if not model_int8 else model_int8
+ save_bpte_program(exec_prog, reference_model, output_name)
 print(f"Bundle PTE file saved as {output_name}")
 else:
 save_pte_program(exec_prog, output_name)
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
index 63cdcc45aad..1568bef0301 100644
--- a/examples/arm/executor_runner/CMakeLists.txt
+++ b/examples/arm/executor_runner/CMakeLists.txt
@@ -492,7 +492,6 @@
 set_property(
 PROPERTY IMPORTED_LOCATION
 "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a"
 )
-
 add_library(quantized_ops_lib STATIC IMPORTED)
 set_property(
 TARGET quantized_ops_lib
@@ -505,7 +504,18 @@
 set_property(
 PROPERTY IMPORTED_LOCATION
 "${ET_BUILD_DIR_PATH}/kernels/quantized/libquantized_kernels.a"
 )
-
+add_library(cortex_m_ops_lib STATIC IMPORTED)
+set_property(
+ TARGET cortex_m_ops_lib
+ PROPERTY IMPORTED_LOCATION
+ "${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_ops_lib.a"
+)
+add_library(cortex_m_kernels STATIC IMPORTED)
+set_property(
+ TARGET cortex_m_kernels
+ PROPERTY IMPORTED_LOCATION
+ "${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_kernels.a"
+)
 add_library(extension_runner_util STATIC IMPORTED)
 set_property(
 TARGET extension_runner_util
@@ -546,9 +556,11 @@
 list(APPEND arm_executor_runner_link
 executorch
 "-Wl,--whole-archive"
 executorch_delegate_ethos_u
+ cortex_m_ops_lib
 quantized_ops_lib
 portable_ops_lib
 quantized_kernels
+ cortex_m_kernels
 portable_kernels
 "-Wl,--no-whole-archive"
 -Xlinker -Map=arm_executor_runner.map
@@ -561,7 +573,7 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
 set_property(
 TARGET etdump
 PROPERTY IMPORTED_LOCATION
- "${ET_BUILD_DIR_PATH}/lib/libetdump.a"
+ "${ET_BUILD_DIR_PATH}/lib/libetdump.a"
 )

 if(CMAKE_BUILD_TYPE MATCHES "Debug")
@@ -574,7 +586,7 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
 set_property(
 TARGET ${FLATCCRT_LIB}
 PROPERTY IMPORTED_LOCATION
- "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
+ "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
 )

 list(APPEND arm_executor_runner_link
@@ -643,4 +655,4 @@ if(SEMIHOSTING)
 ${ETHOS_SDK_PATH}/core_platform/targets/${TARGET_BOARD}/retarget.c
 PROPERTIES HEADER_FILE_ONLY TRUE
 )
-endif()
\ No newline at end of file
+endif()
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 01699087443..ed1cbc5e015 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -177,8 +177,24 @@ backends/arm/scripts/build_portable_kernels.sh --et_build_root="${et_build_root}

 if [[ -z "$model_name" ]]; then
 # the test models run, and whether to delegate
- test_model=( "softmax" "add" "add3" "mv2" )
- model_compiler_flags=( "" "--delegate" "--delegate" "--delegate --quantize" )
+ test_model=(
+ "softmax" # 0
+ "add" # 1
+ "add3" # 2
+ "qadd" # 3
+ "qadd2" # 4
+ "qops" # 5
+ "mv2" # 6
+ )
+ model_compiler_flags=(
+ "" # 0 softmax
+ "--delegate" # 1 add
+ "--delegate" # 2 add3
+ "--delegate --quantize" # 3 qadd
+ "--delegate --quantize" # 4 qadd2
+ "--delegate --quantize" # 5 qops
+ "--delegate --quantize" # 6 mv2
+ )
 else
 test_model=( "$model_name" )
 model_compiler_flags=( "$aot_arm_compiler_flag_delegate $aot_arm_compiler_flag_quantize $aot_arm_compiler_flags" )

From b98c3abc6a367ba6945020f8fb813b4a0e68eda3 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Fri, 9 May 2025 10:34:43 -0700
Subject: [PATCH 025/178] Save some size in pattern/{bitwise,comparison}_op.h
 (#10489)

bloaty told me that we were paying a noticeable size cost for the ::value
members of these structs (at least after the PR in this stack that reapplies
#9841) and now we're not.

Test Plan: bash test/build_optimized_size_test.sh

```
before: adopt functionref
==========
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 11:08 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 2150960 Apr 25 11:08 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5927336 Apr 25 11:08 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1474560 81920 0 4295655424 4297211904 100224000 cmake-out/test/size_test_all_ops
4505600 98304 0 4296376320 4300980224 1005bc000 cmake-out/test/size_test_all_optimized_ops

after:
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 12:24 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 2150960 Apr 25 12:24 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5887368 Apr 25 12:24 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1474560 81920 0 4295655424 4297211904 100224000 cmake-out/test/size_test_all_ops
4489216 98304 0 4296359936 4300947456 1005b4000 cmake-out/test/size_test_all_optimized_ops
```

(yes it's neutral; improves size results for further diffs)
---
 .lintrunner.toml | 7 +++
 kernels/portable/cpu/op_bitwise_and.cpp | 6 ++-
 kernels/portable/cpu/op_bitwise_or.cpp | 6 ++-
 kernels/portable/cpu/op_bitwise_xor.cpp | 6 ++-
 kernels/portable/cpu/op_eq.cpp | 8 ++-
 kernels/portable/cpu/op_ge.cpp | 8 ++-
 kernels/portable/cpu/op_gt.cpp | 6 ++-
 kernels/portable/cpu/op_le.cpp | 8 ++-
 kernels/portable/cpu/op_lt.cpp | 6 ++-
 kernels/portable/cpu/op_ne.cpp | 8 ++-
 kernels/portable/cpu/pattern/bitwise_op.h | 15 +++---
 kernels/portable/cpu/pattern/comparison_op.h | 54 ++------------------
 12 files changed, 63 insertions(+), 75 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index ae0d134f8c7..2835af1bf92 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -220,6 +220,13 @@
 exclude_patterns = [
 'extension/**',
 'kernels/optimized/**',
 # Justified include.
+ 'kernels/portable/cpu/op_bitwise*.cpp', + 'kernels/portable/cpu/op_eq.cpp', + 'kernels/portable/cpu/op_ge.cpp', + 'kernels/portable/cpu/op_gt.cpp', + 'kernels/portable/cpu/op_le.cpp', + 'kernels/portable/cpu/op_lt.cpp', + 'kernels/portable/cpu/op_ne.cpp', 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', diff --git a/kernels/portable/cpu/op_bitwise_and.cpp b/kernels/portable/cpu/op_bitwise_and.cpp index f62d0b70dd4..609dcb1e949 100644 --- a/kernels/portable/cpu/op_bitwise_and.cpp +++ b/kernels/portable/cpu/op_bitwise_and.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,7 @@ Tensor& bitwise_and_Tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_and.Tensor_out"; - return internal::bitwise_tensor_out(ctx, a, b, out); + return internal::bitwise_tensor_out(ctx, a, b, out); } Tensor& bitwise_and_Scalar_out( @@ -29,7 +31,7 @@ Tensor& bitwise_and_Scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_and.Scalar_out"; - return internal::bitwise_scalar_out(ctx, a, b, out); + return internal::bitwise_scalar_out(ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_bitwise_or.cpp b/kernels/portable/cpu/op_bitwise_or.cpp index 8028815fbf9..42cb2a6c3ba 100644 --- a/kernels/portable/cpu/op_bitwise_or.cpp +++ b/kernels/portable/cpu/op_bitwise_or.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,7 @@ Tensor& bitwise_or_Tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_or.Tensor_out"; - return internal::bitwise_tensor_out(ctx, a, b, out); + return internal::bitwise_tensor_out(ctx, a, b, out); } Tensor& bitwise_or_Scalar_out( @@ -29,7 +31,7 @@ Tensor& bitwise_or_Scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_or.Scalar_out"; - return internal::bitwise_scalar_out(ctx, a, b, out); + return internal::bitwise_scalar_out(ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_bitwise_xor.cpp b/kernels/portable/cpu/op_bitwise_xor.cpp index 85badf95789..5fe4e1708d5 100644 --- a/kernels/portable/cpu/op_bitwise_xor.cpp +++ b/kernels/portable/cpu/op_bitwise_xor.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,7 @@ Tensor& bitwise_xor_Tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_xor.Tensor_out"; - return internal::bitwise_tensor_out(ctx, a, b, out); + return internal::bitwise_tensor_out(ctx, a, b, out); } Tensor& bitwise_xor_Scalar_out( @@ -29,7 +31,7 @@ Tensor& bitwise_xor_Scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "bitwise_xor.Scalar_out"; - return internal::bitwise_scalar_out(ctx, a, b, out); + return internal::bitwise_scalar_out(ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_eq.cpp b/kernels/portable/cpu/op_eq.cpp index bddb6181ee0..9e21b82c43c 100644 --- a/kernels/portable/cpu/op_eq.cpp +++ b/kernels/portable/cpu/op_eq.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,8 @@ Tensor& eq_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY 
facebook-hte-CArray static constexpr const char op_name[] = "eq.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out( + ctx, a, b, out); } Tensor& eq_scalar_out( @@ -29,7 +32,8 @@ Tensor& eq_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "eq.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out( + ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_ge.cpp b/kernels/portable/cpu/op_ge.cpp index 8457f91b548..d5e7576b7ae 100644 --- a/kernels/portable/cpu/op_ge.cpp +++ b/kernels/portable/cpu/op_ge.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,8 @@ Tensor& ge_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "ge.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out( + ctx, a, b, out); } Tensor& ge_scalar_out( @@ -29,7 +32,8 @@ Tensor& ge_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "ge.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out( + ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_gt.cpp b/kernels/portable/cpu/op_gt.cpp index bb1f6a274cd..cd65a3b68d9 100644 --- a/kernels/portable/cpu/op_gt.cpp +++ b/kernels/portable/cpu/op_gt.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,7 @@ Tensor& gt_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "gt.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out(ctx, a, b, out); } Tensor& gt_scalar_out( @@ -29,7 +31,7 @@ Tensor& gt_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "gt.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out(ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_le.cpp b/kernels/portable/cpu/op_le.cpp index e893678fc5e..909de1bfad2 100644 --- a/kernels/portable/cpu/op_le.cpp +++ b/kernels/portable/cpu/op_le.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,8 @@ Tensor& le_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "le.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out( + ctx, a, b, out); } Tensor& le_scalar_out( @@ -29,7 +32,8 @@ Tensor& le_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "le.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out( + ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_lt.cpp b/kernels/portable/cpu/op_lt.cpp index 6f1ffb21153..5af89920536 100644 --- a/kernels/portable/cpu/op_lt.cpp +++ b/kernels/portable/cpu/op_lt.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,7 @@ Tensor& lt_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY 
facebook-hte-CArray static constexpr const char op_name[] = "lt.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out(ctx, a, b, out); } Tensor& lt_scalar_out( @@ -29,7 +31,7 @@ Tensor& lt_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "lt.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out(ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/op_ne.cpp b/kernels/portable/cpu/op_ne.cpp index 5e5a2d38a33..a4b292359df 100644 --- a/kernels/portable/cpu/op_ne.cpp +++ b/kernels/portable/cpu/op_ne.cpp @@ -8,6 +8,8 @@ #include +#include + namespace torch { namespace executor { namespace native { @@ -19,7 +21,8 @@ Tensor& ne_tensor_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "ne.Tensor_out"; - return internal::comparison_tensor_out(ctx, a, b, out); + return internal::comparison_tensor_out( + ctx, a, b, out); } Tensor& ne_scalar_out( @@ -29,7 +32,8 @@ Tensor& ne_scalar_out( Tensor& out) { // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "ne.Scalar_out"; - return internal::comparison_scalar_out(ctx, a, b, out); + return internal::comparison_scalar_out( + ctx, a, b, out); } } // namespace native diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..b8d8acf3382 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -47,11 +47,13 @@ constexpr bitwise_fn get_bitwise_fn() { template struct BitwiseFnForOp { - static constexpr auto value = get_bitwise_fn(); - static_assert(value != nullptr, "unknown op_name!"); + static constexpr auto get_value() { + return get_bitwise_fn(); + } + static_assert(get_value() != nullptr, "unknown op_name!"); }; -template +template