diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 58ee78787176..02134b64b619 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -78,9 +78,9 @@ tests/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy ############################## # automation related -src/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 -include/tvm/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 -python/tvm/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 +src/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 @Hzfengsy +include/tvm/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 @Hzfengsy +python/tvm/auto_scheduler/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 @Hzfengsy python/tvm/autotvm/** @merrymercy @jcf94 @comaniac @junrushao1994 @vinx13 @@ -94,9 +94,9 @@ include/tvm/ir/** @junrushao1994 @vinx13 @tqchen @jroesch @comaniac python/tvm/ir/** @junrushao1994 @vinx13 @tqchen @jroesch @comaniac # tir -src/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were -include/tvm/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were -python/tvm/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were +src/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were @Hzfengsy +include/tvm/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were @Hzfengsy +python/tvm/tir/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were @Hzfengsy # te src/te/** @junrushao1994 @vinx13 @tqchen @kparzysz-quic @ZihengJiang @masahi @were @@ -122,11 +122,11 @@ include/tvm/runtime/** @vinx13 @tqchen @FronzenGene @liangfu @areusch @tmoreau8 python/tvm/runtime/** @vinx13 @tqchen @FronzenGene @liangfu @areusch @tmoreau89 @ajtulloch @masahi @kazum @ZihengJiang @junrushao1994 # runtime/micro -src/runtime/micro/** @areusch @liangfu @tmoreau89 -src/runtime/crt/** @areusch @liangfu @tmoreau89 -include/tvm/runtime/crt/** @areusch @liangfu @tmoreau89 -include/tvm/runtime/micro/** @areusch @liangfu @tmoreau89 -python/tvm/micro/** @areusch @liangfu @tmoreau89 +src/runtime/micro/** @areusch @liangfu @tmoreau89 @manupa-arm +src/runtime/crt/** @areusch @liangfu @tmoreau89 @manupa-arm +include/tvm/runtime/crt/** @areusch @liangfu @tmoreau89 @manupa-arm +include/tvm/runtime/micro/** @areusch @liangfu @tmoreau89 @manupa-arm +python/tvm/micro/** @areusch @liangfu @tmoreau89 @manupa-arm # relay src/relay/** @jroesch @slyubomirsky @icemelon9 @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 @@ -140,7 +140,7 @@ inlcude/tvm/relay/qnn/** @jwfromm @anijain2305 @ZihengJiang python/tvm/relay/qnn/** @jwfromm @anijain2305 @ZihengJiang # relay/backend/contrib: BYOC -src/relay/backend/contrib/** @zhiics @trevor-m @comaniac @mbaret +src/relay/backend/contrib/** @zhiics @trevor-m @comaniac @mbaret @manupa-arm # relay/frontends python/tvm/relay/frontend/** @jwfromm @mbrookhart @srkreddy1238 @siju-samuel @Huyuwei @hlu1 @kazum @PariksheetPinjari909 diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 000000000000..a1c1facb7e1b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,22 @@ +--- +name: "\U0001F4C4 Documentation" +about: Use this template to suggest additions and changes to the documentation. 
+title: "[Docs] " +labels: "type: doc" + +--- + +Thanks for participating in the TVM community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposal discussions, roadmaps, and bug tracking. You are always welcome to post on the forum first :smile_cat: + +Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to open a new one if you feel there is an additional problem that needs attention when an old one gets closed. + +### Documentation Title & Type + +Include the title of the document (e.g. "Getting Started with TVM"), and the type of documentation (e.g. user docs, developer docs, tutorials). + +### Additions/Changes Requested + +If an RFC/discuss post exists, link it here. + +Otherwise, specify what actions should be taken to provide additional clarity/readability/reproducibility to the document. Include code snippets from the previous documentation if applicable. + diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index dfe9f572a43d..36a91576edf6 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit dfe9f572a43d41e0c1ecdf036cea97042a0febfe +Subproject commit 36a91576edf633479c78649e050f18dd2ddc8103 diff --git a/CMakeLists.txt b/CMakeLists.txt index 127ba50b3720..bf0a1c61a341 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF) tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF) tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF) tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF) +tvm_option(USE_CMSISNN "Build with Arm CMSIS-NN" OFF) tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON) tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO) tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF) @@ -304,9 +305,12 @@ if(BUILD_FOR_HEXAGON) # static one. if(NOT BUILD_STATIC_RUNTIME) list(APPEND RUNTIME_SRCS src/runtime/hexagon/hexagon_posix.cc) + # Allow undefined symbols (there will be some from libc). + set(TVM_NO_UNDEFINED_SYMBOLS "") endif() add_definitions(-D_MACH_I32=int) + add_definitions(-DDMLC_CXX11_THREAD_LOCAL=0) endif() # Package runtime rules @@ -357,7 +361,7 @@ endif(USE_GRAPH_EXECUTOR) # convert old options for profiler if(USE_GRAPH_EXECUTOR_DEBUG) - message(WARNING "USE_GRAPH_EXECUTOR renamed to USE_PROFILER. Please update your config.cmake") + message(WARNING "USE_GRAPH_EXECUTOR_DEBUG renamed to USE_PROFILER. 
Please update your config.cmake") unset(USE_GRAPH_EXECUTOR_DEBUG CACHE) set(USE_PROFILER ON) endif() @@ -388,6 +392,12 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB) include(GoogleTest) endif() +if(USE_PIPELINE_EXECUTOR) + message(STATUS "Build with Pipeline Executor support...") + file(GLOB RUNTIME_PIPELINE_SRCS src/runtime/pipeline/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_PIPELINE_SRCS}) +endif(USE_PIPELINE_EXECUTOR) + # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) @@ -401,9 +411,12 @@ include(cmake/modules/ROCM.cmake) include(cmake/modules/LLVM.cmake) include(cmake/modules/Micro.cmake) include(cmake/modules/contrib/EthosN.cmake) +include(cmake/modules/contrib/CMSISNN.cmake) +include(cmake/modules/contrib/EthosU.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/CODEGENC.cmake) include(cmake/modules/contrib/DNNL.cmake) +include(cmake/modules/contrib/ExampleTargetHooks.cmake) include(cmake/modules/contrib/Random.cmake) include(cmake/modules/contrib/Posit.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 96e1b9e42a7f..14f8191707c8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -56,6 +56,7 @@ We do encourage everyone to work anything they are interested in. - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm - [Andrew Reusch](https://github.com/areusch): @areusch - runtime, microTVM - [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay +- [Giuseppe Rossini](https://github.com/giuseros): @giuseros - aot, arm - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Junru Shao](https://github.com/junrushao1994) (PMC): @junrushao1994 - relay, compiler - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi @@ -107,6 +108,7 @@ We do encourage everyone to work anything they are interested in. - [Yizhi Liu](https://github.com/yzhliu) : @yzhliu - [Hao Lu](https://github.com/hlu1): @hlu1 - [Eric Lunderberg](https://github.com/Lunderberg): @Lunderberg +- [Andrew Z. Luo](https://github.com/AndrewZhaoLuo): @AndrewZhaoLuo - [Steven Lyubomirsky](https://github.com/slyubomirsky): @slyubomirsky - [Masahiro Masuda](https://github.com/masahi): @masahi - [Sergey Mironov](https://github.com/grwlf): @grwlf diff --git a/apps/bundle_deploy/crt_config/crt_config.h b/apps/bundle_deploy/crt_config/crt_config.h index 58f923512d2e..b89bedbc6d45 100644 --- a/apps/bundle_deploy/crt_config/crt_config.h +++ b/apps/bundle_deploy/crt_config/crt_config.h @@ -43,7 +43,7 @@ #define TVM_CRT_MAX_REGISTERED_MODULES 2 /*! Size of the global function registry, in bytes. */ -#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 200 +#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512 /*! Maximum packet size, in bytes, including the length header. 
*/ #define TVM_CRT_MAX_PACKET_SIZE_BYTES 512 diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index ccd372b1adf4..8950eb4eab1d 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -23,6 +23,7 @@ #include "TVMRuntime.h" // Runtime API #include "../../../src/runtime/c_runtime_api.cc" +#include "../../../src/runtime/contrib/random/random.cc" #include "../../../src/runtime/cpu_device_api.cc" #include "../../../src/runtime/dso_library.cc" #include "../../../src/runtime/file_utils.cc" diff --git a/apps/microtvm/arduino/example_project/src/model.c b/apps/microtvm/arduino/example_project/src/model.c index 9e7c47f75160..553665191b14 100644 --- a/apps/microtvm/arduino/example_project/src/model.c +++ b/apps/microtvm/arduino/example_project/src/model.c @@ -20,11 +20,11 @@ #include "model.h" #include "Arduino.h" +#include "standalone_crt/include/dlpack/dlpack.h" #include "standalone_crt/include/tvm/runtime/crt/stack_allocator.h" // AOT memory array static uint8_t g_aot_memory[WORKSPACE_SIZE]; -extern tvm_model_t tvmgen_default_network; tvm_workspace_t app_workspace; // Blink code for debugging purposes diff --git a/apps/microtvm/arduino/host_driven/src/model_support.c b/apps/microtvm/arduino/host_driven/src/model_support.c index dfcb031136c5..bcc9a109cace 100644 --- a/apps/microtvm/arduino/host_driven/src/model_support.c +++ b/apps/microtvm/arduino/host_driven/src/model_support.c @@ -17,6 +17,8 @@ * under the License. */ +#include "standalone_crt/include/dlpack/dlpack.h" +#include "standalone_crt/include/tvm/runtime/crt/error_codes.h" #include "stdarg.h" // Blink code for debugging purposes diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py index 57177179bcd0..3d25d0bcad8f 100644 --- a/apps/microtvm/arduino/template_project/microtvm_api_server.py +++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py @@ -57,6 +57,7 @@ class BoardAutodetectFailed(Exception): "package": "arduino", "architecture": "sam", "board": "arduino_due_x_dbg", + "model": "sam3x8e", }, # Due to the way the Feather S2 bootloader works, compilation # behaves fine but uploads cannot be done automatically @@ -64,27 +65,32 @@ class BoardAutodetectFailed(Exception): "package": "esp32", "architecture": "esp32", "board": "feathers2", + "model": "esp32", }, "metrom4": { "package": "adafruit", "architecture": "samd", "board": "adafruit_metro_m4", + "model": "atsamd51", }, # Spresense only works as of its v2.3.0 sdk "spresense": { "package": "SPRESENSE", "architecture": "spresense", "board": "spresense", + "model": "cxd5602gg", }, "nano33ble": { "package": "arduino", "architecture": "mbed_nano", "board": "nano33ble", + "model": "nrf52840", }, "pybadge": { "package": "adafruit", "architecture": "samd", "board": "adafruit_pybadge_m4", + "model": "atsamd51", }, # The Teensy boards are listed here for completeness, but they # won't work until https://github.com/arduino/arduino-cli/issues/700 @@ -93,16 +99,19 @@ class BoardAutodetectFailed(Exception): "package": "teensy", "architecture": "avr", "board": "teensy40", + "model": "imxrt1060", }, "teensy41": { "package": "teensy", "architecture": "avr", "board": "teensy41", + "model": "imxrt1060", }, "wioterminal": { "package": "Seeeduino", "architecture": "samd", "board": "seeed_wio_terminal", + "model": "atsamd51", }, } @@ -114,6 +123,11 @@ class BoardAutodetectFailed(Exception): choices=list(BOARD_PROPERTIES), help="Name of the 
Arduino board to build for", ), + server.ProjectOption( + "arduino_model", + choices=[board["model"] for _, board in BOARD_PROPERTIES.items()], + help="Name of the model for each Arduino board.", + ), server.ProjectOption("arduino_cli_cmd", help="Path to the arduino-cli tool."), server.ProjectOption("port", help="Port to use for connecting to hardware"), server.ProjectOption( @@ -370,7 +384,7 @@ def build(self, options): compile_cmd.append("--verbose") # Specify project to compile - subprocess.run(compile_cmd) + subprocess.run(compile_cmd, check=True) BOARD_LIST_HEADERS = ("Port", "Type", "Board Name", "FQBN", "Core") @@ -407,7 +421,9 @@ def _parse_boards_tabular_str(self, tabular_str): def _auto_detect_port(self, options): list_cmd = [options["arduino_cli_cmd"], "board", "list"] - list_cmd_output = subprocess.run(list_cmd, stdout=subprocess.PIPE).stdout.decode("utf-8") + list_cmd_output = subprocess.run( + list_cmd, check=True, stdout=subprocess.PIPE + ).stdout.decode("utf-8") desired_fqbn = self._get_fqbn(options) for line in self._parse_boards_tabular_str(list_cmd_output): @@ -444,7 +460,7 @@ def flash(self, options): if options.get("verbose"): upload_cmd.append("--verbose") - subprocess.run(upload_cmd) + subprocess.run(upload_cmd, check=True) def open_transport(self, options): # Zephyr example doesn't throw an error in this case diff --git a/apps/microtvm/pyproject.toml b/apps/microtvm/pyproject.toml index 8bfae0a157cd..98c769be48f5 100644 --- a/apps/microtvm/pyproject.toml +++ b/apps/microtvm/pyproject.toml @@ -111,6 +111,7 @@ tensorflow-estimator = {version = "^2.1", optional = true} # TFLite frontend tflite = {version = "2.1.0", optional = true} wheel = "*" +cloudpickle = "^1.6.0" [tool.poetry.extras] diff --git a/apps/microtvm/reference-vm/README.md b/apps/microtvm/reference-vm/README.md index 9303c0a64ece..a5bb63574ce3 100644 --- a/apps/microtvm/reference-vm/README.md +++ b/apps/microtvm/reference-vm/README.md @@ -78,14 +78,14 @@ $ ./base-box-tool.py --provider virtualbox build zephyr B. Run tests: ```bash - $ ./base-box-tool.py [--provider=PROVIDER] test --microtvm-platform=MICROTVM_PLATFORM [--test-device-serial=SERIAL] PLATFORM + $ ./base-box-tool.py [--provider=PROVIDER] test --microtvm-board=MICROTVM_BOARD [--test-device-serial=SERIAL] PLATFORM ``` - where MICROTVM_PLATFORM is one of the options listed in the + where MICROTVM_BOARD is one of the options listed in the PLATFORM/base-box/test-config.json file. 
For example: ```base - $ ./base-box-tool.py --provider virtualbox test --microtvm-platform=stm32f746xx_disco zephyr + $ ./base-box-tool.py --provider virtualbox test --microtvm-board=stm32f746g_disco zephyr ``` This command does the following for the specified provider: diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh index 996d303d48fb..11d89f2cd44e 100644 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh +++ b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh @@ -30,9 +30,9 @@ cd ~ sudo apt-get install -y ca-certificates -# Install Arduino-CLI (latest version) +# Install Arduino-CLI (specific version) export PATH="/home/vagrant/bin:$PATH" -wget -O - https://raw.githubusercontent.com/arduino/arduino-cli/master/install.sh | sh -s +wget -O - https://raw.githubusercontent.com/arduino/arduino-cli/master/install.sh | sh -s 0.18.3 # Arduino (the CLI and GUI) require the dialout permission for uploading sudo usermod -a -G dialout $USER @@ -45,10 +45,11 @@ ADAFRUIT_BOARDS_URL="https://adafruit.github.io/arduino-board-index/package_adaf ESP32_BOARDS_URL="https://raw.githubusercontent.com/espressif/arduino-esp32/gh-pages/package_esp32_dev_index.json" SPARKFUN_BOARDS_URL="https://raw.githubusercontent.com/sparkfun/Arduino_Boards/master/IDE_Board_Manager/package_sparkfun_index.json" SEEED_BOARDS_URL="https://files.seeedstudio.com/arduino/package_seeeduino_boards_index.json" -SPRESENSE_BOARDS_URL="https://github.com/sonydevworld/spresense-arduino-compatible/releases/download/generic/package_spresense_index.json" +SPRESENSE_BOARDS_URL="https://github.com/sonydevworld/spresense-arduino-compatible/releases/download/v2.2.1/package_spresense_index.json" arduino-cli core update-index --additional-urls $ADAFRUIT_BOARDS_URL,$ESP32_BOARDS_URL,$SPARKFUN_BOARDS_URL,$SEEED_BOARDS_URL,$SPRESENSE_BOARDS_URL # Install supported cores from those URLS +arduino-cli version arduino-cli core install arduino:mbed_nano arduino-cli core install arduino:sam arduino-cli core install adafruit:samd --additional-urls $ADAFRUIT_BOARDS_URL diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh b/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh index 3d8597f19b64..5c3d96dfc7df 100755 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh +++ b/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -# Usage: base_box_test.sh <microtvm_platform> +# Usage: base_box_test.sh <board> # Execute microTVM Arduino tests. 
# @@ -24,17 +24,17 @@ set -e set -x if [ "$#" -lt 1 ]; then - echo "Usage: base_box_test.sh <microtvm_platform>" + echo "Usage: base_box_test.sh <board>" exit -1 fi -microtvm_platform=$1 +board=$1 -pytest tests/micro/arduino/test_arduino_workflow.py --microtvm-platforms=${microtvm_platform} +pytest tests/micro/arduino/test_arduino_workflow.py --arduino-board=${board} -if [ $microtvm_platform == "nano33ble" ]; then +if [ $board == "nano33ble" ]; then # https://github.com/apache/tvm/issues/8730 - echo "NOTE: skipped test_arduino_rpc_server.py on $microtvm_platform -- known failure" + echo "NOTE: skipped test_arduino_rpc_server.py on $board -- known failure" else - pytest tests/micro/arduino/test_arduino_rpc_server.py --microtvm-platforms=${microtvm_platform} + pytest tests/micro/arduino/test_arduino_rpc_server.py --arduino-board=${board} fi diff --git a/apps/microtvm/reference-vm/arduino/provision_setup.sh b/apps/microtvm/reference-vm/arduino/provision_setup.sh index aeb46a8f7649..1a24cbad9419 100644 --- a/apps/microtvm/reference-vm/arduino/provision_setup.sh +++ b/apps/microtvm/reference-vm/arduino/provision_setup.sh @@ -22,7 +22,8 @@ set -ex # NOTE: TVM is presumed to be mounted already by Vagrantfile. cd "${TVM_HOME}" -apps/microtvm/reference-vm/rebuild-tvm.sh +platform="arduino" +apps/microtvm/reference-vm/rebuild-tvm.sh ${platform} # Build poetry cd apps/microtvm/reference-vm/arduino @@ -43,6 +44,6 @@ echo "------------------------------[ TVM Message ]----------------------------- poetry lock -vvv poetry install -echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm" >>~/.profile +echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm-${platform}" >>~/.profile echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/arduino && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile echo "source \$VENV_PATH/bin/activate" >>~/.profile diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index f32885433c2b..3a5fd18cede7 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -28,7 +28,6 @@ import subprocess import sys - _LOG = logging.getLogger(__name__) @@ -48,10 +47,19 @@ "zephyr", ) -# List of identifying strings for microTVM platforms for testing. -# Must match PLATFORMS as defined in tvm/tests/micro/[platform]/conftest.py -# TODO add a way to declare supported platforms to ProjectAPI -ALL_MICROTVM_PLATFORMS = { +# Extra scripts required to execute on provisioning +# in [platform]/base-box/base_box_provision.sh +EXTRA_SCRIPTS = { + "arduino": (), + "zephyr": ("docker/install/ubuntu_init_zephyr_project.sh",), +} + +PACKER_FILE_NAME = "packer.json" + + +# List of identifying strings for microTVM boards for testing. 
+# TODO add a way to declare supported boards to ProjectAPI +ALL_MICROTVM_BOARDS = { "arduino": ( "due", "feathers2", @@ -64,22 +72,13 @@ "wioterminal", ), "zephyr": ( - "stm32f746xx_nucleo", - "stm32f746xx_disco", - "nrf5340dk", + "nucleo_f746zg", + "stm32f746g_disco", + "nrf5340dk_nrf5340_cpuapp", "mps2_an521", ), } -# Extra scripts required to execute on provisioning -# in [platform]/base-box/base_box_provision.sh -EXTRA_SCRIPTS = { - "arduino": (), - "zephyr": ("docker/install/ubuntu_init_zephyr_project.sh",), -} - -PACKER_FILE_NAME = "packer.json" - def parse_virtualbox_devices(): output = subprocess.check_output(["VBoxManage", "list", "usbhost"], encoding="utf-8") @@ -362,7 +361,7 @@ def _quote_cmd(cmd): + _quote_cmd( [ f"apps/microtvm/reference-vm/{platform}/base-box/base_box_test.sh", - test_config["microtvm_platform"], + test_config["microtvm_board"], ] ) ) @@ -376,22 +375,22 @@ def test_command(args): with open(test_config_file) as f: test_config = json.load(f) - # select microTVM test platform - microtvm_test_platform = test_config[args.microtvm_platform] + # select microTVM test config + microtvm_test_config = test_config[args.microtvm_board] for key, expected_type in REQUIRED_TEST_CONFIG_KEYS.items(): - assert key in microtvm_test_platform and isinstance( - microtvm_test_platform[key], expected_type + assert key in microtvm_test_config and isinstance( + microtvm_test_config[key], expected_type ), f"Expected key {key} of type {expected_type} in {test_config_file}: {test_config!r}" - microtvm_test_platform["vid_hex"] = microtvm_test_platform["vid_hex"].lower() - microtvm_test_platform["pid_hex"] = microtvm_test_platform["pid_hex"].lower() - microtvm_test_platform["microtvm_platform"] = args.microtvm_platform + microtvm_test_config["vid_hex"] = microtvm_test_config["vid_hex"].lower() + microtvm_test_config["pid_hex"] = microtvm_test_config["pid_hex"].lower() + microtvm_test_config["microtvm_board"] = args.microtvm_board providers = args.provider provider_passed = {p: False for p in providers} - release_test_dir = os.path.join(THIS_DIR, "release-test") + release_test_dir = os.path.join(THIS_DIR, f"release-test-{args.platform}") if args.skip_build: assert len(providers) == 1, "--skip-build was given, but >1 provider specified" @@ -406,7 +405,7 @@ def test_command(args): release_test_dir, args.platform, provider_name, - microtvm_test_platform, + microtvm_test_config, args.test_device_serial, ) provider_passed[provider_name] = True @@ -511,10 +510,10 @@ def parse_args(): platform_specific_parser = parser_test_platform_subparsers.add_parser(platform) platform_specific_parser.set_defaults(platform=platform) platform_specific_parser.add_argument( - "--microtvm-platform", - choices=ALL_MICROTVM_PLATFORMS[platform], + "--microtvm-board", + choices=ALL_MICROTVM_BOARDS[platform], required=True, - help="MicroTVM platfrom used for testing.", + help="MicroTVM board used for testing.", ) # Options for release subcommand diff --git a/apps/microtvm/reference-vm/rebuild-tvm.sh b/apps/microtvm/reference-vm/rebuild-tvm.sh index 1cebcf7166af..aca138d877b3 100755 --- a/apps/microtvm/reference-vm/rebuild-tvm.sh +++ b/apps/microtvm/reference-vm/rebuild-tvm.sh @@ -15,9 +15,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# +# "Usage ./apps/microtvm/reference-vm/rebuild-tvm.sh <platform>" +# set -e +if [ "$#" -lt 1 -o "$1" == "--help" ]; then + echo "Usage ./apps/microtvm/reference-vm/rebuild-tvm.sh <platform>" + exit -1 +fi + +platform=$1 +shift 1 + # Get number of cores for build if [ -n "${TVM_CI_NUM_CORES}" ]; then num_cores=${TVM_CI_NUM_CORES} @@ -28,7 +39,7 @@ fi cd "$(dirname $0)" cd "$(git rev-parse --show-toplevel)" -BUILD_DIR=build-microtvm +BUILD_DIR="build-microtvm-${platform}" if [ ! -e "${BUILD_DIR}" ]; then mkdir "${BUILD_DIR}" diff --git a/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh b/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh index 8eba63e9e331..a1da0bbe4e35 100755 --- a/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh +++ b/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -# Usage: base_box_test.sh <microtvm_platform> +# Usage: base_box_test.sh <board> # Execute microTVM Zephyr tests. # @@ -24,16 +24,16 @@ set -e set -x if [ "$#" -lt 1 ]; then - echo "Usage: base_box_test.sh <microtvm_platform>" + echo "Usage: base_box_test.sh <board>" exit -1 fi -microtvm_platform=$1 +board=$1 -pytest tests/micro/zephyr/test_zephyr.py --microtvm-platforms=${microtvm_platform} +pytest tests/micro/zephyr/test_zephyr.py --zephyr-board=${board} -if [ $microtvm_platform == "stm32f746xx" ]; then - echo "NOTE: skipped test_zephyr_aot.py on $microtvm_platform -- known failure" +if [ $board == "stm32f746g_disco" ] || [ $board == "nucleo_f746zg" ]; then + echo "NOTE: skipped test_zephyr_aot.py on $board -- known failure" else - pytest tests/micro/zephyr/test_zephyr_aot.py --microtvm-platforms=${microtvm_platform} + pytest tests/micro/zephyr/test_zephyr_aot.py --zephyr-board=${board} fi diff --git a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json index f3f2633d9468..6e0cfa5e2527 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json +++ b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json @@ -1,13 +1,13 @@ { - "stm32f746xx_nucleo": { + "nucleo_f746zg": { "vid_hex": "0483", "pid_hex": "374b" }, - "stm32f746xx_disco": { + "stm32f746g_disco": { "vid_hex": "0483", "pid_hex": "374b" }, - "nrf5340dk": { + "nrf5340dk_nrf5340_cpuapp": { "vid_hex": "1366", "pid_hex": "1055" }, diff --git a/apps/microtvm/reference-vm/zephyr/provision_setup.sh b/apps/microtvm/reference-vm/zephyr/provision_setup.sh index 2ee2350b377a..e1f3bef75508 100644 --- a/apps/microtvm/reference-vm/zephyr/provision_setup.sh +++ b/apps/microtvm/reference-vm/zephyr/provision_setup.sh @@ -22,7 +22,8 @@ set -ex # NOTE: TVM is presumed to be mounted already by Vagrantfile. 
cd "${TVM_HOME}" -apps/microtvm/reference-vm/rebuild-tvm.sh +platform="zephyr" +apps/microtvm/reference-vm/rebuild-tvm.sh ${platform} # Build poetry cd apps/microtvm/reference-vm/zephyr @@ -44,7 +45,7 @@ poetry lock -vvv poetry install poetry run pip3 install -r ${ZEPHYR_BASE}/scripts/requirements.txt -echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm" >>~/.profile +echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm-${platform}" >>~/.profile echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/zephyr && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile echo "source \$VENV_PATH/bin/activate" >>~/.profile echo "export PATH=\"\${PATH}:\${HOME}/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin\"" >>~/.profile diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index f267648a83f9..f2e091b2f5b5 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -57,6 +57,47 @@ IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists() +# Data structure to hold the information microtvm_api_server.py needs +# to communicate with each of these boards. +BOARD_PROPERTIES = { + "qemu_x86": { + "board": "qemu_x86", + "model": "host", + }, + "qemu_riscv32": { + "board": "qemu_riscv32", + "model": "host", + }, + "qemu_riscv64": { + "board": "qemu_riscv64", + "model": "host", + }, + "mps2_an521": { + "board": "mps2_an521", + "model": "mps2_an521", + }, + "nrf5340dk_nrf5340_cpuapp": { + "board": "nrf5340dk_nrf5340_cpuapp", + "model": "nrf5340dk", + }, + "stm32f746g_disco": { + "board": "stm32f746g_disco", + "model": "stm32f746xx", + }, + "nucleo_f746zg": { + "board": "nucleo_f746zg", + "model": "stm32f746xx", + }, + "nucleo_l4r5zi": { + "board": "nucleo_l4r5zi", + "model": "stm32l4r5zi", + }, + "qemu_cortex_r5": { + "board": "qemu_cortex_r5", + "model": "zynq_mp_r5", + }, +} + def check_call(cmd_args, *args, **kwargs): cwd_str = "" if "cwd" not in kwargs else f" (in cwd: {kwargs['cwd']})" @@ -215,28 +256,26 @@ def _get_nrf_device_args(options): PROJECT_OPTIONS = [ server.ProjectOption( - "extra_files", - help="If given, during generate_project, uncompress the tarball at this path into the project dir", + "extra_files_tar", + help="If given, during generate_project, uncompress the tarball at this path into the project dir.", ), server.ProjectOption( - "gdbserver_port", help=("If given, port number to use when running the local gdbserver") + "gdbserver_port", help=("If given, port number to use when running the local gdbserver.") ), server.ProjectOption( "nrfjprog_snr", - help=( - "When used with nRF targets, serial # of the " "attached board to use, from nrfjprog" - ), + help=("When used with nRF targets, serial # of the attached board to use, from nrfjprog."), ), server.ProjectOption( "openocd_serial", - help=("When used with OpenOCD targets, serial # of the " "attached board to use"), + help=("When used with OpenOCD targets, serial # of the attached board to use."), ), server.ProjectOption( "project_type", help="Type of project to generate.", choices=tuple(PROJECT_TYPES), ), - server.ProjectOption("verbose", help="Run build with verbose output"), + server.ProjectOption("verbose", help="Run build with verbose output.", choices=(True, False)), server.ProjectOption( "west_cmd", help=( @@ -245,7 +284,16 @@ def _get_nrf_device_args(options): ), ), 
server.ProjectOption("zephyr_base", help="Path to the zephyr base directory."), - server.ProjectOption("zephyr_board", help="Name of the Zephyr board to build for"), + server.ProjectOption( + "zephyr_board", + choices=list(BOARD_PROPERTIES), + help="Name of the Zephyr board to build for.", + ), + server.ProjectOption( + "zephyr_model", + choices=[board["model"] for _, board in BOARD_PROPERTIES.items()], + help="Name of the model for each Zephyr board.", + ), ] @@ -399,6 +447,9 @@ def build(self, options): if options.get("zephyr_base"): cmake_args.append(f"-DZEPHYR_BASE:STRING={options['zephyr_base']}") + if options.get("west_cmd"): + cmake_args.append(f"-DWEST={options['west_cmd']}") + cmake_args.append(f"-DBOARD:STRING={options['zephyr_board']}") check_call(cmake_args, cwd=BUILD_DIR) diff --git a/cmake/config.cmake b/cmake/config.cmake index 8d8186c1b4f0..0ab498695fbf 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -105,6 +105,9 @@ set(USE_GRAPH_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) +# Whether enable pipeline executor. +set(USE_PIPELINE_EXECUTOR OFF) + # Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) diff --git a/cmake/modules/HexagonSDK.cmake b/cmake/modules/HexagonSDK.cmake index 9541f5be821c..42785116214e 100644 --- a/cmake/modules/HexagonSDK.cmake +++ b/cmake/modules/HexagonSDK.cmake @@ -76,6 +76,7 @@ function(find_hexagon_sdk_root HEXAGON_SDK_PATH HEXAGON_ARCH) # - HEXAGON_SDK_VERSION # - HEXAGON_SDK_INCLUDES # - HEXAGON_QURT_INCLUDES + # - HEXAGON_QURT_LIBS # - HEXAGON_RPCMEM_ROOT # - HEXAGON_REMOTE_ROOT # - HEXAGON_QAIC_EXE @@ -95,6 +96,8 @@ function(find_hexagon_sdk_root HEXAGON_SDK_PATH HEXAGON_ARCH) set_parent(HEXAGON_QURT_INCLUDES "${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/posix" "${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/qurt") + set_parent(HEXAGON_QURT_LIBS + "${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/lib") set_parent(HEXAGON_RPCMEM_ROOT "${HEXAGON_SDK_ROOT}/libs/common/rpcmem") set_parent(HEXAGON_REMOTE_ROOT "${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64") @@ -111,6 +114,8 @@ function(find_hexagon_sdk_root HEXAGON_SDK_PATH HEXAGON_ARCH) set_parent(HEXAGON_QURT_INCLUDES "${HEXAGON_SDK_ROOT}/rtos/qurt/${HEXARCH_DIR}/include/posix" "${HEXAGON_SDK_ROOT}/rtos/qurt/${HEXARCH_DIR}/include/qurt") + set_parent(HEXAGON_QURT_LIBS + "${HEXAGON_SDK_ROOT}/rtos/qurt/${HEXARCH_DIR}/lib/pic") set_parent(HEXAGON_RPCMEM_ROOT "${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem") set_parent(HEXAGON_REMOTE_ROOT # libadsprpc.so "${HEXAGON_SDK_ROOT}/ipc/fastrpc/remote/ship/android_aarch64") diff --git a/cmake/modules/contrib/CMSISNN.cmake b/cmake/modules/contrib/CMSISNN.cmake new file mode 100644 index 000000000000..4bd0fd865dc0 --- /dev/null +++ b/cmake/modules/contrib/CMSISNN.cmake @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_CMSISNN) + message(STATUS "Build with CMSIS-NN support") + file(GLOB RELAY_CONTRIB_CMSISNN_SRCS src/relay/backend/contrib/cmsisnn/*.cc) + list(APPEND COMPILER_SRCS ${RELAY_CONTRIB_CMSISNN_SRCS}) +endif(USE_CMSISNN) diff --git a/cmake/modules/contrib/EthosU.cmake b/cmake/modules/contrib/EthosU.cmake new file mode 100644 index 000000000000..8f3e09b8179b --- /dev/null +++ b/cmake/modules/contrib/EthosU.cmake @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_ETHOSU) + file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*) + list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC}) +endif(USE_ETHOSU) \ No newline at end of file diff --git a/cmake/modules/contrib/ExampleTargetHooks.cmake b/cmake/modules/contrib/ExampleTargetHooks.cmake new file mode 100644 index 000000000000..eb53dda133d2 --- /dev/null +++ b/cmake/modules/contrib/ExampleTargetHooks.cmake @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +file(GLOB EXAMPLE_TARGET_HOOKS_SRC src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc) +list(APPEND COMPILER_SRCS ${EXAMPLE_TARGET_HOOKS_SRC}) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 88d68408381c..eff86a950b90 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -36,6 +36,6 @@ pip3 install \ pytest-xdist \ requests \ scipy \ - synr==0.3.0 \ + synr==0.4.0 \ six \ tornado diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh index 774d85dcf68a..25c214068cd0 100644 --- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh +++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh @@ -25,5 +25,5 @@ mkdir "$PYXIR_HOME" pip3 install progressbar h5py==2.10.0 -git clone --recursive --branch v0.1.6 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" +git clone --recursive --branch v0.3.1 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" cd "${PYXIR_HOME}" && python3 setup.py install diff --git a/docs/contribute/code_review.rst b/docs/contribute/code_review.rst index 9b2ed0c5471e..778574458427 100644 --- a/docs/contribute/code_review.rst +++ b/docs/contribute/code_review.rst @@ -17,23 +17,155 @@ .. _code_review_guide: + Perform Code Reviews ==================== -This is a general guideline for code reviewers. First of all, while it is great to add new features to a project, we must also be aware that each line of code we introduce also brings **technical debt** that we may have to eventually pay. -Open source code is maintained by a community with diverse background, and hence it is even more important to provide clear, documented and maintainable code. Code reviews are a shepherding process to spot potential problems, improve quality of the code. We should, however, not rely on the code review process to get the code into a ready state. Contributors are encouraged to polish the code to a ready state before requesting reviews. This is especially expected for code owner and committer candidates. +Open source code is maintained by a community with diverse backgrounds, interests, and goals. +Hence it is important to provide clear, documented and maintainable code and processes. Code reviews are a +shepherding process used to collectively spot potential problems, improve quality of the code, and educate both contributors +and reviewers about the code base and its assumptions. It is also one mechanism to ensure there are multiple people who can +maintain a related piece of code together. Contributors are encouraged to polish the code to a reviewable +state before requesting reviews. This is especially important for committer candidates, as committers are expected +to participate in not only writing code but also reviewing it. + +This document is a living guideline for code review in open source. Please also take some time to read +:ref:`community_guide` about the general development process. + +Building Trust +-------------- + +First and foremost, we are building a community that is based on trust, which takes time +and effort to both build and maintain. We expect our community members to work together in a +constructive way and with common sense. Although we all have different sets of backgrounds, +interests, and goals, we must work together to find solutions that work for the larger community. 
+Trust-based collaboration is also a key tenet of the Apache way and an important factor to consider in growing the community, +and promoting members to official roles. + +Community Participation +----------------------- + +Everyone is welcome to comment on PRs. We encourage committers to wait for some period of time (e.g. three days) +before merging a PR that contains a major architecture change. The goal is to give people time to speak up and +express interest in reviewing and participating. + +Remembering that we are all coming from different backgrounds is important here. For example, some community members +work in different time zones, only work on open source during work hours, or may be traveling or having other events +going on in their lives. An important part of working in a large project is ensuring there is collective understanding, +so no one person is a bottleneck. While it is important to allow time for participation in code review, we also cannot +block all changes on all reviewers. Remember that helping people land PRs is a great way to encourage broader +participation, especially for those who volunteer their time to contribute. + +Part of this is trusting and communicating with fellow maintainers that, if changes need to be applied in the future, +PR authors will later follow through on their promises. It is the responsibility of committers to listen to all +feedback, whether from PMC members or new contributors, and consider what actions need to be taken. + +Read the code carefully +----------------------- -Here are some checklists for code reviews, it is also helpful reference for contributors. +Sometimes we may quickly read through the code and only pick up on selected aspects of the code. These types of comments +are usually helpful and should be welcomed in the community. However, they are only part of performing code review and +should be part of more comprehensive feedback. A good and careful code review is a large time investment and sometimes +can be longer than writing the actual contribution. +For example, receiving only highly critical feedback on minor aspects of your PR rarely feels good, and it can be discouraging +if your time and effort were not reciprocated during review. Practicing empathy when acting both as a contributor and committer +is important and can help make you a more effective code reviewer and contributor. +We expect that all committers carefully read and understand the code before signing off. There is a lot of trust involved when +a committer hits the merge button. In the meantime, we acknowledge that sometimes problems slip through; in that case, the +merger is responsible for ensuring the correct follow-up actions are taken. + +Be Respectful +------------- + +- To everyone who is making comments: making constructive comments will help new contributors to land their PRs + in a timely manner and help us welcome new members to the community. + +- To authors: reviewers should spend significant time reading the code, and a careful review could be as time intensive + as writing the code from scratch. Respectfully address review comments and reciprocate the review by helping review + others' changes in the future. + +Most importantly, focus on having a constructive conversation, and try to assume best intentions when interacting as a reviewer. +If there is something in the process not working, consider getting some face time with the other contributors and discussing +how to improve the process or communication. 
+ +Factors to Consider about Code Quality +-------------------------------------- + +High quality code is critical to the long term success of the project. There are many factors of code quality to consider +during a code review: + +- F0: Overall architecture. This includes the definition of public modules, key data structures and public interfaces. + Good architectural choices are critical to the success of the project in the long run. +- F1: Architectural consistency. There are usually multiple ways to implement a new feature. We must ensure new + features are consistent with previous overall architectural choices and interact well with the existing code. + Every new feature increases the complexity of the project, and a consistent design ideally minimizes the increase + in complexity brought by a new feature, making it easier to maintain code in the long run. +- F2: Code robustness and test coverage. Ensure code runs correctly in all possible settings (platforms), ensure + test coverage of the new feature. Clear error messages for user facing errors. +- F3: User facing API documentation: documentation of public user facing APIs and key module interfaces is mandatory. + This includes the API, data structures that appear in the public interface (i.e., `include/tvm` and user facing python APIs). + We generally encourage well documented code and include some form of documentation for internal APIs that are used in + multiple places, see also F4. +- F4: Code readability. Readability involves multiple aspects: instructive and consistent function names, clear implementation + of the overall flow, descriptive comments for complex code logic and internal functions. Readable code is easier to maintain. + +Architectural design and consistency are the most important factors since they are likely to introduce the most long term technical debt. +As a result, committers should most carefully consider these factors before merging the code. + +Test coverage and API documentation are expected for code contributions. + +Code readability is a relatively subjective matter compared to the others. +Different people have different thoughts on how to best write code. Reviewers should make constructive and actionable comments. +In the meantime, code review should not be used as a way to get others to write code exactly the way you would. +Conversely, you should also consider that what you may easily understand or find acceptable might not work for the larger +community or other members. Use your judgment on what is appropriate based on the content and the scope of the contribution +and where the contributor is coming from. + +We follow common :ref:`code_guide` when writing code. Style guides help ensure that code is readable and maintainable by others, +long after the original author has moved on. Style guides are about more than code formatting: they also pertain +to the correct way to document code, variable naming, and other conventions that are not enforced by automatic formatters. + +Consensus Building +------------------ + +Disagreements can happen during code reviews. We encourage building consensus among the people involved. We are working together +and building trust with each other in OSS. The nature of OSS means sometimes we make compromises on less significant issues to +make steady progress and welcome broader participation in the community. Compromise unfortunately means sometimes the world will +not be exactly as we would like; this is true even for leaders of the community. 
+ +- Be civil and build consensus through constructive, technically based conversations. +- A committer who owns the area can serve as a shepherd to drive the discussion by taking all the conversations into consideration, + and suggest a resolution to move forward. +- Because a lot of trust is placed in the committer (shepherd), they should read the PR carefully before signing off. Additionally, + the merger should also take the responsibility to follow up in case there are problems caused by the merge. + +Consistency +----------- + +A final remark is that we are all human and it's hard to always be perfectly consistent. If contributors feel that you didn't apply these guidelines +in a consistent way, it is important to listen and hear folks out. We will constantly have to iterate on processes and guidelines as we evolve as a community. +Our goal is to strive to be consistent and objective, but all of us are unfortunately human and imperfect and will need to adjust and learn. + +Additional Recommendations +-------------------------- + +Scope the PRs +~~~~~~~~~~~~~ + +We recommend that authors send well-scoped PRs that are easy to review and revert in case there is a problem. +Authors should avoid merging multiple unrelated changes into a single PR and should split them into separate PRs. + +Label the PRs with Area Prefix +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +When sending pull requests, it is helpful to prefix the PR title with the related areas (e.g. use [TIR] for TIR-related changes). +This would help people recognize the related areas and find PRs they are interested in. -Hold the Highest Standard ------------------------- -The first rule for code reviewers is to always keep the highest standard, and do not approve code just to "be friendly". Good, informative critics each other learn and prevent technical debt in early stages. Deliberate on API and Data Structures -------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A minimum and stable API is critical to the project’s life. A good API makes a huge difference. Always think very carefully about all the aspects including naming, argument definitions and behavior. When possible, pay more attention still to the proposed API design during code reviews. @@ -57,47 +189,31 @@ Here are some useful principles for designing APIs: - Minimum. Think about how many lines of code a user has to write to use the API. Remove layers of abstraction when possible. - -Ensure Test Coverage -------------------- -Each new change of features should introduce test cases. -Bug fixes should include regression tests that prevent the problem from happening again. - -Documentation is Mandatory --------------------------- -Documentation is often overlooked. When adding new functions or changing an existing function, the documentation should be directly updated. A new feature is meaningless without documentation to make it accessible. See more at :ref:`doc_guide` - -Minimum Dependency ------------------ -Always be cautious in introducing dependencies. While it is important to reuse code and avoid reinventing the wheel, dependencies can increase burden of users in deployment. A good design principle is that a feature or function should only have a dependecy if/when a user actually use it. - -Ensure Readability ------------------ -While it is hard to implement a new feature, it is even harder to make others understand and maintain the code you wrote. It is common for a PMC or committer to not be -able to understand certain contributions. 
In such case, a reviewer should say "I don’t understand" and ask the contributor to clarify. We highly encourage code comments which explain the code logic along with the code. +Minimize Dependencies +~~~~~~~~~~~~~~~~~~~~~ +Always be cautious in introducing dependencies. While it is important to reuse code and avoid reinventing the wheel, +dependencies can increase the burden on users in deployment. A good design principle is that a feature or function +should only have a dependency if/when a user actually uses it. Concise Implementation ----------------------- +~~~~~~~~~~~~~~~~~~~~~~ Some basic principles applied here: favor vectorized array code over loops, use existing APIs that solve the problem. Document Lessons in Code Reviews --------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When you find there are some common or recurring lessons that can be summarized, add it to the :ref:`code_guide`. It is always good to refer to the guideline document when requesting changes, so the lessons can be shared to all the community. -Respect each other ------------------ -The code reviewers and contributors are paying the most precious currency in the world -- time. We are volunteers in the community to spend the time to build good code, help each other, learn and have fun hacking. Learn from other Code Reviews ------------------------------ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There can be multiple reviewers reviewing the same changes. Many times other reviewers may spot things you did not find. Try to learn from other code reviews, when possible, document these lessons. Approve and Request Changes Explicitly --------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The contributor and code owner can request code reviews from multiple reviewers. Remember to approve changes when your comments are addressed in a code review. To do so -- please click on changes tab in the pull request, then select approve, diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst index 68e77ecfa43e..4682e5aa5b33 100644 --- a/docs/langref/relay_pattern.rst +++ b/docs/langref/relay_pattern.rst @@ -476,6 +476,8 @@ is required to be run before the callback. The function ``def callback(self, pre, post, node_map)`` will be invoked when the rewriter matches ``self.pattern``. ``node_map`` is a dictionary mapping from pattern nodes to matched nodes in the graph. +The callback function will be invoked recursively on the returned pattern until the pattern stops changing. As a result, if ``self.pattern`` matches any part of the graph that the callback returned, the rewriter will run in a loop. If you want to avoid multiple rewrites, you can pass a ``rewrite_once=True`` parameter to the constructor. + Pattern Partitioning ******************** diff --git a/include/tvm/arith/int_set.h b/include/tvm/arith/int_set.h index b9e81c0a5533..6b350e25e167 100644 --- a/include/tvm/arith/int_set.h +++ b/include/tvm/arith/int_set.h @@ -121,17 +121,24 @@ class IntSet : public ObjectRef { * \return The result set containing the indices in the vector. */ static IntSet Vector(PrimExpr vec); + /*! + * \brief Construct a set representing a range [min, min + extent). + * \param min The minimum of the range. + * \param extent The extent of the range. + * \return The constructed set. + */ + static IntSet FromMinExtent(PrimExpr min, PrimExpr extent); /*! * \brief Construct a set representing a range. * \param r The range - * \return constructed set. + * \return The constructed set. 
*/ static IntSet FromRange(tvm::Range r); /*! * \brief Construct a set representing a interval. * \param min The minimum value of the interval. * \param max The maximum value of the interval. - * \return constructed set. + * \return The constructed set. */ static IntSet Interval(PrimExpr min, PrimExpr max); diff --git a/include/tvm/ir/affine_type.h b/include/tvm/ir/affine_type.h index afbe1f343bb8..5726e9eec1f0 100644 --- a/include/tvm/ir/affine_type.h +++ b/include/tvm/ir/affine_type.h @@ -71,17 +71,20 @@ class TensorAffineTypeNode : public AffineTypeNode { RelayExpr zero_point; /*! \brief The data type of this type */ DataType dtype; + /*! \brief The axis for per-channel quantization */ + int axis; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("scale", &scale); v->Visit("zero_point", &zero_point); v->Visit("dtype", &dtype); + v->Visit("axis", &axis); } bool SEqualReduce(const TensorAffineTypeNode* other, SEqualReducer equal) const { equal->MarkGraphNode(); return equal(scale, other->scale) && equal(zero_point, other->zero_point) && - equal(dtype, other->dtype); + equal(dtype, other->dtype) && equal(axis, other->axis); } void SHashReduce(SHashReducer hash_reduce) const { @@ -89,6 +92,7 @@ class TensorAffineTypeNode : public AffineTypeNode { hash_reduce(scale); hash_reduce(zero_point); hash_reduce(dtype); + hash_reduce(axis); } static constexpr const char* _type_key = "TensorAffineType"; @@ -101,7 +105,7 @@ class TensorAffineTypeNode : public AffineTypeNode { */ class TensorAffineType : public AffineType { public: - TVM_DLL TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype); + TVM_DLL TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype, int axis); TVM_DEFINE_OBJECT_REF_METHODS(TensorAffineType, AffineType, TensorAffineTypeNode); }; diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index fefb08f878ef..852c7d0d8a98 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -122,6 +122,7 @@ class IRModuleNode : public Object { v->Visit("global_var_map_", &global_var_map_); v->Visit("global_type_var_map_", &global_type_var_map_); v->Visit("source_map", &source_map); + v->Visit("attrs", &attrs); } TVM_DLL bool SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const; @@ -277,6 +278,12 @@ class IRModuleNode : public Object { */ TVM_DLL void Update(const IRModule& other); + /*! + * \brief Create a shallow copy of this IRModule. + * \returns The shallow copy of the IRModule. + */ + TVM_DLL IRModule ShallowCopy(); + /*! * \brief Import Relay code from the file at path. * \param path The path of the Relay code to import. @@ -348,12 +355,14 @@ class IRModule : public ObjectRef { * \brief constructor * \param functions Functions in the module. * \param type_definitions Type definitions in the module. - * \param import_set Set of imported files in the module + * \param import_set Set of imported files in the module. * \param map The module source map. + * \param attrs The module attributes. */ TVM_DLL explicit IRModule(Map functions, Map type_definitions = {}, - std::unordered_set import_set = {}, parser::SourceMap map = {}); + std::unordered_set import_set = {}, parser::SourceMap map = {}, + DictAttrs attrs = {}); /*! \brief default constructor */ IRModule() : IRModule(Map({})) {} @@ -415,6 +424,13 @@ class IRModule : public ObjectRef { */ TVM_DLL static IRModule FromText(const String& text, const String& source_path); + /*! + * \brief Create a shallow copy of an IRModule. + * \param mod The module to copy. 
+   * \return The copied module.
+   */
+  IRModule ShallowCopyIRModule(IRModule mod);
+
  /*! \brief Declare the container type. */
  using ContainerType = IRModuleNode;
 
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index d28044c3845d..de60deb9cccb 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -1017,8 +1017,8 @@ struct DensePackAttrs : public tvm::AttrsNode {
         .set_default(NullValue())
         .describe("Output data type, set to explicit type under mixed precision setting");
     TVM_ATTR_FIELD(weight_layout)
-        .set_default("NK")
-        .describe("Dimension ordering of weight. Packed layouts, such as NK8n, are possible.");
+        .set_default("NC")
+        .describe("Dimension ordering of weight. Packed layouts, such as NC8n, are possible.");
   }
 };
 
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index a8317e1e51ad..e3f9bad17ef5 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -475,7 +475,7 @@ struct ScanopAttrs : public tvm::AttrsNode {
         .describe("The first element is not included")
         .set_default(Bool(false));
   }
-};
+};  // struct ScanopAttrs
 
 /*! \brief Attributes used in unique operator */
 struct UniqueAttrs : public tvm::AttrsNode {
@@ -489,6 +489,15 @@
   }
 };  // struct UniqueAttrs
 
+/*! \brief Attributes used in einsum operator */
+struct EinsumAttrs : public tvm::AttrsNode {
+  String equation;
+
+  TVM_DECLARE_ATTRS(EinsumAttrs, "relay.attrs.EinsumAttrs") {
+    TVM_ATTR_FIELD(equation).describe("The einsum expression string");
+  }
+};  // struct EinsumAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index bdc46d71a77d..912879dc8a4b 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -426,6 +426,13 @@ TVM_DLL Pass RemoveUnusedFunctions(Array entry_functions);
  */
 TVM_DLL Pass SimplifyExpr();
 
+/*!
+ * \brief Run any RelayToTIR passes registered on the functions in a module.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass RelayToTIRTargetHook();
+
 /*!
  * \brief A pass for manifesting explicit memory allocations and rewriting
  * specific dialects.
diff --git a/include/tvm/runtime/container/map.h b/include/tvm/runtime/container/map.h
index 671e38b83581..3fe4f697bb9e 100644
--- a/include/tvm/runtime/container/map.h
+++ b/include/tvm/runtime/container/map.h
@@ -1353,7 +1353,7 @@ class Map : public ObjectRef {
    * Otherwise make a new copy of the array to ensure the current handle
    * holds a unique copy.
    *
-   * \return Handle to the internal node container(which ganrantees to be unique)
+   * \return Handle to the internal node container (which is guaranteed to be unique)
    */
  MapNode* CopyOnWrite() {
    if (data_.get() == nullptr) {
diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h
index d1a8619e8233..776691c4c7fc 100644
--- a/include/tvm/runtime/crt/error_codes.h
+++ b/include/tvm/runtime/crt/error_codes.h
@@ -93,6 +93,7 @@ typedef enum {
  kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0),
  kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1),
  kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2),
+  kTvmErrorFunctionCallInvalidArg = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 3),
 
  // Time Evaluator - times functions for use with debug runtime.
 kTvmErrorTimeEvaluatorBadHandle = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryTimeEvaluator, 0),
 
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index eea195f64a6d..7ee140622bfc 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -251,6 +251,12 @@ class Report : public ObjectRef {
    */
   explicit Report(Array> calls, Map> device_metrics);
+
+  /*! Deserialize a Report from a JSON object. Needed for sending the report over RPC.
+   * \param json Serialized json report from `ReportNode::AsJSON`.
+   * \returns A Report.
+   */
+  static Report FromJSON(String json);
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Report, ObjectRef, ReportNode);
 };
 
diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h
index deec662e74ad..64a1023158e1 100644
--- a/include/tvm/target/target.h
+++ b/include/tvm/target/target.h
@@ -31,7 +31,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 33776cbe1985..66dd5375eaf9 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -305,6 +305,41 @@ class ScheduleNode : public runtime::Object {
   virtual BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
                              const String& storage_scope) = 0;
   /******** Schedule: Compute location ********/
+  /*!
+   * \brief Move a producer block under a specific loop, and regenerate the
+   * loops induced by the block so that the buffer region produced by the producer block could
+   * cover those regions consumed by its consumer blocks under the given loop. It requires:
+   * 1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`
+   * 2) The scope block has the stage-pipeline property
+   * 3) The subtree of the scope block that contains the given block satisfies the compact dataflow
+   * condition, i.e. all the blocks in the scope block's subtree must be either a complete block or
+   * a reduction block
+   * 4) The block is not an output block with regard to the scope block, i.e. the buffers written by
+   * the block are allocated under the scope block
+   * 5) All the consumers of the block are under the given loop
+   * \param block_rv The block to be moved
+   * \param loop_rv The loop under which the block is to be moved
+   * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+   */
+  virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
+                         bool preserve_unit_loops) = 0;
+  /*!
+   * \brief Move a consumer block under a specific loop, and regenerate the
+   * loops induced by the block so that the buffer region consumed by the consumer block could
+   * cover those regions produced by its producer blocks under the given loop. It requires:
+   * 1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`
+   * 2) The scope block has the stage-pipeline property
+   * 3) The subtree of the scope block that contains the given block satisfies the compact dataflow
+   * condition, i.e. all the blocks in the scope block's subtree must be either a complete block or
+   * a reduction block
+   * 4) All the producers of the block are under the given loop
+   *
+   * \param block_rv The block to be moved
+   * \param loop_rv The loop under which the block is to be moved
+   * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+   */
+  virtual void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
+                                bool preserve_unit_loops) = 0;
   /*!
* \brief Inline a block into its consumer(s). It requires: * 1) The block is a complete non-root block, which only produces one buffer diff --git a/include/tvm/tir/schedule/state.h b/include/tvm/tir/schedule/state.h index 35299a3fa84b..7cd1b00c15ef 100644 --- a/include/tvm/tir/schedule/state.h +++ b/include/tvm/tir/schedule/state.h @@ -128,11 +128,6 @@ class ScheduleStateNode : public Object { */ TVM_DLL void Replace(const tir::StmtSRef& src_sref, const Stmt& tgt_stmt, const Map& block_sref_reuse); - /*! - * \brief Recalculate the `affine_binding` flag of the scope block info. - * \param scope_sref The sref to the interested scope block. - */ - TVM_DLL void UpdateAffineFlag(const StmtSRef& scope_sref); /*! * \brief Trigger the verification according to the `debug_mask` bitmask. * 1) If the bitmask `kVerifySRefTree` is on, verify the correctness of the sref tree. diff --git a/include/tvm/topi/cuda/normalization.h b/include/tvm/topi/cuda/normalization.h deleted file mode 100644 index 270b6af2d5e5..000000000000 --- a/include/tvm/topi/cuda/normalization.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file cuda/normalization.h - * \brief CUDA schedule for LRN and l2 normalization operations - */ -#ifndef TVM_TOPI_CUDA_NORMALIZATION_H_ -#define TVM_TOPI_CUDA_NORMALIZATION_H_ - -#include -#include -#include -#include - -namespace tvm { -namespace topi { - -using namespace tvm::te; -namespace cuda { -/*! - * \brief Create a CUDA schedule for LRN - * \param outs The output tensors. - * \return A schedule for the given ops. 
- */ -inline Schedule schedule_lrn(const Array& outs) { - Array out_ops; - for (auto t : outs) { - out_ops.push_back(t->op); - } - Schedule s = create_schedule(out_ops); - int num_thread = 64; - IterVar block_x = tvm::te::thread_axis(Range(), "blockIdx.x"); - IterVar thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x"); - Tensor lrn = outs[0]; - Tensor sqr_sum_up = lrn->op->InputTensors()[1]; - Tensor sqr_sum = sqr_sum_up->op->InputTensors()[0]; - Tensor set_pad = sqr_sum->op->InputTensors()[0]; - s[set_pad].bind(set_pad->op.as()->axis[0], block_x); - IterVar rxk = sqr_sum->op.as()->reduce_axis[0]; - IterVar xko, xki; - s[sqr_sum].split(rxk, num_thread, &xko, &xki); - Tensor srf = s.rfactor(sqr_sum, xki)[0]; - s[sqr_sum].bind(s[sqr_sum]->op.as()->axis[0], block_x); - s[sqr_sum].bind(s[sqr_sum]->op.as()->reduce_axis[0], thread_x); - s[srf].compute_at(s[sqr_sum], s[sqr_sum]->op.as()->reduce_axis[0]); - s[sqr_sum_up].bind(sqr_sum_up->op.as()->axis[0], block_x); - IterVar xto, xti; - s[lrn].split_by_nparts(lrn->op.as()->axis[1], num_thread, &xto, &xti); - s[lrn].bind(lrn->op.as()->axis[0], block_x); - s[lrn].bind(xto, thread_x); - - return s; -} - -} // namespace cuda -} // namespace topi -} // namespace tvm -#endif // TVM_TOPI_CUDA_NORMALIZATION_H_ diff --git a/include/tvm/topi/nn/local_response_norm.h b/include/tvm/topi/nn/local_response_norm.h index 717adb8ff8fa..c826ec07cf09 100644 --- a/include/tvm/topi/nn/local_response_norm.h +++ b/include/tvm/topi/nn/local_response_norm.h @@ -64,17 +64,26 @@ inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.00 auto rxs = tvm::te::reduce_axis(Range(0, size), "rxs"); Tensor sqr_sum; if (axis == 1) { - sqr_sum = tvm::te::compute(input_shape, [&](Var i, Var l, Var j, Var k) { - return tvm::sum(pad_data(i, l + rxs, j, k) * pad_data(i, l + rxs, j, k), {rxs}); - }); + sqr_sum = tvm::te::compute( + input_shape, + [&](Var i, Var l, Var j, Var k) { + return tvm::sum(pad_data(i, l + rxs, j, k) * pad_data(i, l + rxs, j, k), {rxs}); + }, + "tensor", "sqr_sum"); } else if (axis == 3) { - sqr_sum = tvm::te::compute(input_shape, [&](Var i, Var l, Var j, Var k) { - return tvm::sum(pad_data(i, l, j, k + rxs) * pad_data(i, l, j, k + rxs), {rxs}); - }); + sqr_sum = tvm::te::compute( + input_shape, + [&](Var i, Var l, Var j, Var k) { + return tvm::sum(pad_data(i, l, j, k + rxs) * pad_data(i, l, j, k + rxs), {rxs}); + }, + "tensor", "sqr_sum"); } - auto sqrt_sum_up = tvm::te::compute(input_shape, [&](Var i, Var j, Var k, Var l) { - return tvm::pow(bias + (div(alpha * sqr_sum(i, j, k, l), size)), beta); - }); + auto sqrt_sum_up = tvm::te::compute( + input_shape, + [&](Var i, Var j, Var k, Var l) { + return tvm::pow(bias + (div(alpha * sqr_sum(i, j, k, l), size)), beta); + }, + "tensor", kElementWise); return topi::divide(data, sqrt_sum_up); } } // namespace nn diff --git a/python/gen_requirements.py b/python/gen_requirements.py index 781db2cb872a..7470ccc92496 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -150,6 +150,17 @@ ], ), ), + # Vitis AI requirements + ( + "vitis-ai", + ( + "Requirements for the Vitis AI codegen", + [ + "h5py", + "progressbar", + ], + ), + ), # XGBoost, useful for autotuning on some targets. 
     (
         "xgboost",
@@ -217,6 +228,7 @@
     ),
     # Work around https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
     ("ethos-u-vela", "==2.1.1"),
     ("future", None),
+    ("h5py", "==2.10.0"),
     ("image", None),
     ("matplotlib", None),
     ("numpy", None),
@@ -224,6 +236,7 @@
     ("onnxruntime", None),
     ("opencv-python", None),
     ("pillow", None),
+    ("progressbar", None),
     ("psutil", None),
     ("pylint", None),
     ("scipy", None),
@@ -231,7 +244,7 @@
     ("sphinx_autodoc_annotation", None),
     ("sphinx_gallery", None),
     ("sphinx_rtd_theme", None),
-    ("synr", "==0.3.0"),
+    ("synr", "==0.4.0"),
     ("tensorflow", None),
     ("tensorflow-estimator", None),
     ("tflite", None),
diff --git a/python/setup.py b/python/setup.py
index dd13a12d8903..f6afa42610d4 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -49,13 +49,22 @@ def get_lib_path():
     if not CONDA_BUILD:
         lib_path = libinfo["find_lib_path"]()
         libs = [lib_path[0]]
-        if libs[0].find("runtime") == -1:
+        if "runtime" not in libs[0]:
             for name in lib_path[1:]:
-                if name.find("runtime") != -1:
+                if "runtime" in name:
                     libs.append(name)
                     break
+
+        # Add standalone_crt, if present
+        for name in lib_path:
+            candidate_path = os.path.join(os.path.dirname(name), "standalone_crt")
+            if os.path.isdir(candidate_path):
+                libs.append(candidate_path)
+                break
+
     else:
         libs = None
+
     return libs, version
 
@@ -154,9 +163,16 @@ def is_pure(self):
 if wheel_include_libs:
     with open("MANIFEST.in", "w") as fo:
         for path in LIB_LIST:
-            shutil.copy(path, os.path.join(CURRENT_DIR, "tvm"))
-            _, libname = os.path.split(path)
-            fo.write("include tvm/%s\n" % libname)
+            if os.path.isfile(path):
+                shutil.copy(path, os.path.join(CURRENT_DIR, "tvm"))
+                _, libname = os.path.split(path)
+                fo.write(f"include tvm/{libname}\n")
+
+            if os.path.isdir(path):
+                _, libname = os.path.split(path)
+                shutil.copytree(path, os.path.join(CURRENT_DIR, "tvm", libname))
+                fo.write(f"recursive-include tvm/{libname} *\n")
+
     setup_kwargs = {"include_package_data": True}
 
 if include_libs:
@@ -206,4 +222,10 @@ def get_package_data_files():
     os.remove("MANIFEST.in")
     for path in LIB_LIST:
         _, libname = os.path.split(path)
-        os.remove("tvm/%s" % libname)
+        path_to_be_removed = f"tvm/{libname}"
+
+        if os.path.isfile(path_to_be_removed):
+            os.remove(path_to_be_removed)
+
+        if os.path.isdir(path_to_be_removed):
+            shutil.rmtree(path_to_be_removed)
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 55a228882691..57374c54b297 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -68,6 +68,9 @@
 # Contrib initializers
 from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel
 
+if support.libinfo().get("USE_MICRO", "OFF") == "ON":
+    from .
import micro + # NOTE: This file should be python2 compatible so we can # raise proper error message when user run the package using # an older version of the python diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index b746dbf96f43..c58aeea57d14 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -43,6 +43,7 @@ from tvm.driver import build_module from tvm.ir import transform from tvm.autotvm.measure.measure_methods import set_cuda_target_arch +from tvm.autotvm.env import AutotvmGlobalScope, reset_global_scope from tvm.contrib import tar, ndk from tvm.contrib.popen_pool import PopenWorker, PopenPoolExecutor, StatusKind from tvm.target import Target @@ -651,8 +652,8 @@ def local_build_worker(args): Parameters ---------- - args: Tuple[MeasureInput, str, int, int] - inputs, build-func, time, verbose args passed to local_builder_build + args: Tuple[MeasureInput, callable, int] + inputs, build-func, verbose args passed to local_builder_build Returns ------- @@ -660,10 +661,6 @@ def local_build_worker(args): The build result of this Builder thread. """ inp, build_func, verbose = args - assert build_func == BuildFunc.name, ( - "BuildFunc.name: " + BuildFunc.name + ", but args is: " + build_func - ) - build_func = BuildFunc.build_func return _local_build_worker(inp, build_func, verbose) @@ -692,13 +689,18 @@ def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbo res : List[BuildResult] The build results of these MeasureInputs. """ - executor = PopenPoolExecutor(n_parallel, timeout) + assert build_func == BuildFunc.name, ( + "BuildFunc.name: " + BuildFunc.name + ", but args is: " + build_func + ) + executor = PopenPoolExecutor( + n_parallel, timeout, reset_global_scope, (AutotvmGlobalScope.current,) + ) tuple_res = executor.map_with_error_catching( local_build_worker, [ ( i.serialize(), - build_func, + BuildFunc.build_func, verbose, ) for i in inputs diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 8b68f4e9002a..1977d3de5506 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -171,7 +171,7 @@ def extract_tasks( desc=",".join(func_names), ) ) - weights.append(weight) + weights.append(int(weight)) if dump_workload_to_dag_log is not None: with open(dump_workload_to_dag_log, "w") as f: diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index 9b975063105f..baa0bb365fe6 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -480,7 +480,9 @@ def _tune_task(self, task_idx): def _compute_score(self, costs): """compute the objective function""" - return self.objective_func(costs) + # Make sure to return float. 
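+        # The objective function may return a TVM FloatImm rather than a plain
+        # Python float; its numeric payload lives in `.value`, so unwrap it below.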
+ score = self.objective_func(costs) + return score.value if hasattr(score, "value") else score def _adjust_similarity_group(self, task_idx): """adjust the similarity group for the selected task""" @@ -598,7 +600,7 @@ def pre_tune(self, task_scheduler, task_id): # overall info if all(cost < 1e9 for cost in task_scheduler.best_costs): - total_latency_str = "%.3f" % (task_scheduler.cur_score.value * 1e3) + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) else: total_latency_str = "-" print( @@ -629,7 +631,7 @@ def __init__(self, log_file): def post_tune(self, task_scheduler, task_id): if all(cost < 1e9 for cost in task_scheduler.best_costs): - total_latency_str = "%.3f" % (task_scheduler.cur_score.value * 1e3) + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) else: total_latency_str = "N/A" diff --git a/python/tvm/autotvm/env.py b/python/tvm/autotvm/env.py index ddd510ce55ed..ae07a852413c 100644 --- a/python/tvm/autotvm/env.py +++ b/python/tvm/autotvm/env.py @@ -30,3 +30,10 @@ def __init__(self): GLOBAL_SCOPE = AutotvmGlobalScope() + + +def reset_global_scope(global_scope): + """Reset global autotvm state. This is needed to initialize PopenPool workers.""" + global GLOBAL_SCOPE + GLOBAL_SCOPE = global_scope + AutotvmGlobalScope.current = global_scope diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py index c4c0dc92b116..10b0843402ea 100644 --- a/python/tvm/autotvm/measure/__init__.py +++ b/python/tvm/autotvm/measure/__init__.py @@ -31,4 +31,3 @@ request_remote, ) from .executor import Executor -from .local_executor import LocalExecutor diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py deleted file mode 100644 index a9aeb790c82a..000000000000 --- a/python/tvm/autotvm/measure/local_executor.py +++ /dev/null @@ -1,157 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Local based implementation of the executor using multiprocessing""" - -import signal - -from multiprocessing import Process, Queue - -try: - from queue import Empty -except ImportError: - from Queue import Empty - -try: - import psutil -except ImportError: - psutil = None - -from . 
import executor - - -def kill_child_processes(parent_pid, sig=signal.SIGTERM): - """kill all child processes recursively""" - try: - parent = psutil.Process(parent_pid) - children = parent.children(recursive=True) - except psutil.NoSuchProcess: - return - for process in children: - try: - process.send_signal(sig) - except psutil.NoSuchProcess: - return - - -def _execute_func(func, queue, args, kwargs): - """execute function and return the result or exception to a queue""" - try: - res = func(*args, **kwargs) - except Exception as exc: # pylint: disable=broad-except - res = exc - queue.put(res) - - -def call_with_timeout(queue, timeout, func, args, kwargs): - """A wrapper to support timeout of a function call""" - - # start a new process for timeout (cannot use thread because we have c function) - p = Process(target=_execute_func, args=(func, queue, args, kwargs)) - p.start() - p.join(timeout=timeout) - - queue.put(executor.TimeoutError()) - - kill_child_processes(p.pid) - p.terminate() - p.join() - - -class LocalFuture(executor.Future): - """Local wrapper for the future - - Parameters - ---------- - process: multiprocessing.Process - process for running this task - queue: multiprocessing.Queue - queue for receiving the result of this task - """ - - def __init__(self, process, queue): - self._done = False - self._process = process - self._queue = queue - - def done(self): - self._done = self._done or not self._queue.empty() - return self._done - - def get(self, timeout=None): - try: - res = self._queue.get(block=True, timeout=timeout) - except Empty: - raise executor.TimeoutError() - if self._process.is_alive(): - kill_child_processes(self._process.pid) - self._process.terminate() - self._process.join() - self._queue.close() - self._queue.join_thread() - self._done = True - del self._queue - del self._process - return res - - -class LocalFutureNoFork(executor.Future): - """Local wrapper for the future. - This is a none-fork version of LocalFuture. - Use this for the runtime that does not support fork (like cudnn) - """ - - def __init__(self, result): - self._result = result - - def done(self): - return True - - def get(self, timeout=None): - return self._result - - -class LocalExecutor(executor.Executor): - """Local executor that runs workers on the same machine with multiprocessing. - - Parameters - ---------- - timeout: float, optional - timeout of a job. If time is out. A TimeoutError will be returned (not raised) - do_fork: bool, optional - For some runtime systems that do not support fork after initialization - (e.g. cuda runtime, cudnn). Set this to False if you have used these runtime - before submitting jobs. - """ - - def __init__(self, timeout=None, do_fork=True): - self.timeout = timeout or executor.Executor.DEFAULT_TIMEOUT - self.do_fork = do_fork - - if self.do_fork: - if not psutil: - raise RuntimeError( - "Python package psutil is missing. " "please try `pip install psutil`" - ) - - def submit(self, func, *args, **kwargs): - if not self.do_fork: - return LocalFutureNoFork(func(*args, **kwargs)) - - queue = Queue(2) # Size of 2 to avoid a race condition with size 1. - process = Process(target=call_with_timeout, args=(queue, self.timeout, func, args, kwargs)) - process.start() - return LocalFuture(process, queue) diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 8438b807d46e..ea7de35ad9e8 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -16,6 +16,7 @@ # under the License. 
 # pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
 """User facing API for specifying how to measure the generated code"""
+import enum
 import multiprocessing
 from collections import namedtuple
 
@@ -52,8 +53,19 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
         The absolute time stamp when we finish measurement.
     """
 
+    def __repr__(self):
+        error_no_str = (
+            str(self.error_no)
+            if self.error_no not in MeasureErrorNo
+            else str(MeasureErrorNo(self.error_no))
+        )
+        return (
+            f"{self.__class__.__name__}(costs={self.costs!r}, error_no={error_no_str}, "
+            f"all_cost={self.all_cost}, timestamp={self.timestamp!r})"
+        )
 
-class MeasureErrorNo(object):
+
+class MeasureErrorNo(enum.IntEnum):
     """Error type for MeasureResult"""
 
     NO_ERROR = 0  # no error
@@ -77,12 +89,15 @@ class Builder(object):
     n_parallel: int, optional
         The number of tasks submitted in parallel.
         By default it will use all cpu cores
+    build_kwargs: dict, optional
+        Keyword args given to the build function.
     """
 
-    def __init__(self, timeout=10, n_parallel=None):
+    def __init__(self, timeout=10, n_parallel=None, build_kwargs=None):
         self.timeout = timeout
         self.n_parallel = n_parallel or multiprocessing.cpu_count()
-        self.build_kwargs = {}
+        self.user_build_kwargs = build_kwargs if build_kwargs is not None else {}
+        self.runner_build_kwargs = None
         self.task = None
 
     def set_task(self, task, build_kwargs=None):
@@ -97,7 +112,17 @@ def set_task(self, task, build_kwargs=None):
             The additional kwargs for build function
         """
         self.task = task
-        self.build_kwargs = build_kwargs
+        self.build_kwargs = dict(build_kwargs.items()) if build_kwargs is not None else {}
+        if any(k in self.build_kwargs for k in self.user_build_kwargs):
+            logging.warning(
+                "Overriding these runner-supplied kwargs with user-supplied:\n%s",
+                "\n".join(
+                    f"  * {k}: from {build_kwargs[k]!r} to {self.user_build_kwargs[k]!r}"
+                    for k in sorted([k for k in build_kwargs if k in self.user_build_kwargs])
+                ),
+            )
+        for k, v in self.user_build_kwargs.items():
+            self.build_kwargs[k] = v
 
     def build(self, measure_inputs):
         """Build programs
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index eab6822b63b8..efe45daa1464 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -38,7 +38,9 @@
 import tvm.ir.transform
 from tvm import nd
 from tvm import rpc as _rpc
+from tvm.autotvm.env import AutotvmGlobalScope, reset_global_scope
 from tvm.contrib import ndk, nvcc, stackvm, tar
+from tvm.contrib.popen_pool import PopenPoolExecutor
 from tvm.driver import build
 from tvm.error import TVMError
 from tvm.target import Target
@@ -46,7 +48,6 @@
 from ..env import AutotvmGlobalScope
 from ..task.space import InstantiationError
 from ..utils import get_const_tuple
-from .local_executor import LocalExecutor
 from .measure import Builder, MeasureErrorNo, MeasureResult, Runner
 
 logger = logging.getLogger("autotvm")
@@ -78,15 +79,22 @@ class LocalBuilder(Builder):
         The timeout of a compilation
     n_parallel: int
         The number of tasks run in parallel. "None" will use all cpu cores
+    build_kwargs: dict
+        If supplied, additional kwargs passed to build_func. Overrides any build_kwargs supplied
+        by the Runner.
     build_func: callable or str
         If is 'default', use default build function
         If is 'ndk', use function for android ndk
         If is 'stackvm', use function for stackvm
         If is callable, use it as custom build function, expect lib_format field.
+ do_fork: bool + If False, do not fork when building. Requires n_parallel=1. """ - def __init__(self, timeout=10, n_parallel=None, build_func="default"): - super(LocalBuilder, self).__init__(timeout, n_parallel) + def __init__( + self, timeout=10, n_parallel=None, build_kwargs=None, build_func="default", do_fork=False + ): + super(LocalBuilder, self).__init__(timeout, n_parallel, build_kwargs) if isinstance(build_func, str): if build_func == "default": @@ -98,7 +106,14 @@ def __init__(self, timeout=10, n_parallel=None, build_func="default"): else: raise ValueError("Invalid build_func" + build_func) self.build_func = _WrappedBuildFunc(build_func) - self.executor = LocalExecutor(timeout=timeout) + if not do_fork: + assert n_parallel in ( + None, + 1, + ), f"if do_fork=False, need n_parallel=None or 1; got {n_parallel}" + self.executor = PopenPoolExecutor( + timeout=timeout, initializer=reset_global_scope, initargs=(AutotvmGlobalScope.current,) + ) self.tmp_dir = tempfile.mkdtemp() def build(self, measure_inputs): @@ -114,53 +129,52 @@ def build(self, measure_inputs): futures.append(ret) for future in futures: - res = future.get() - - if isinstance(res, Exception): - # timeout or fleet error, return MeasureResult directly - results.append( - MeasureResult( - (res,), MeasureErrorNo.BUILD_TIMEOUT, self.timeout, time.time() - ) - ) - elif res.error is not None: - # instantiation error - if isinstance(res.error, InstantiationError): - results.append( - MeasureResult( + try: + res = future.result() + if res.error is not None: + # instantiation error + if isinstance(res.error, InstantiationError): + res = MeasureResult( (res.error,), MeasureErrorNo.INSTANTIATION_ERROR, res.time_cost, time.time(), ) - ) - else: - if "InstantiationError" in str(res.error): - msg = str(res.error) - try: - msg = msg.split("\n")[-2].split(": ")[1] - except Exception: # pylint: disable=broad-except - pass - results.append( - MeasureResult( + + else: + if "InstantiationError" in str(res.error): + msg = str(res.error) + try: + msg = msg.split("\n")[-2].split(": ")[1] + except Exception: # pylint: disable=broad-except + pass + res = MeasureResult( (InstantiationError(msg),), MeasureErrorNo.INSTANTIATION_ERROR, res.time_cost, time.time(), ) - ) - else: # tvm error - results.append( - MeasureResult( + + else: # tvm error + res = MeasureResult( (res.error,), MeasureErrorNo.COMPILE_HOST, res.time_cost, time.time(), ) - ) - else: - # return BuildResult - results.append(res) + except TimeoutError as ex: + res = MeasureResult( + (ex,), MeasureErrorNo.BUILD_TIMEOUT, self.timeout, time.time() + ) + except ChildProcessError as ex: + res = MeasureResult( + (ex,), + MeasureErrorNo.RUNTIME_DEVICE, + self.timeout, + time.time(), + ) + + results.append(res) return results @@ -242,7 +256,11 @@ def __init__( self.cooldown_interval = cooldown_interval self.module_loader = module_loader - self.executor = LocalExecutor(timeout=timeout * (self.n_parallel + 1)) + self.executor = PopenPoolExecutor( + timeout=timeout * (self.n_parallel + 1), + initializer=reset_global_scope, + initargs=(AutotvmGlobalScope.current,), + ) @property def ref_input(self): @@ -337,15 +355,15 @@ def run(self, measure_inputs, build_results): futures.append(ret) for future in futures: - res = future.get() - if isinstance(res, Exception): # executor error or timeout + try: + res = future.result() + results.append(res) + except Exception as ex: # pylint: disable=broad-except results.append( MeasureResult( - (str(res),), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time() 
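+                        # Timeouts and worker failures surface here as exceptions
+                        # and are recorded as RUN_TIMEOUT measure errors.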
+                        (str(ex),), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time()
                     )
                 )
-            else:
-                results.append(res)
 
         return results
 
@@ -512,7 +530,16 @@ def __call__(self, measure_input, tmp_dir, **kwargs):
             )
             # TODO(tvm-team) consider inlining _build_func_common
             func, arg_info = _build_func_common(measure_input, **kwargs)
-            func.export_library(filename, self.build_func)
+            if self.build_func.output_format == ".model-library-format":
+                # Late import to preserve autoTVM with USE_MICRO OFF
+                try:
+                    from tvm import micro  # pylint: disable=import-outside-toplevel
+                except ImportError:
+                    raise ImportError("Requires USE_MICRO")
+
+                micro.export_model_library_format(func, filename)
+            else:
+                func.export_library(filename, self.build_func)
         except Exception as e:  # pylint: disable=broad-except
             return BuildResult(None, None, e, time.time() - tic)
         return BuildResult(filename, arg_info, None, time.time() - tic)
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index dc75de206d05..40ee24e077b4 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -145,8 +145,8 @@ def __del__(self):
         if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
             sys.stdout.write(
-                "\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) "
-                "| %.2f s" % (prefix, 0, 0, 0, total, time.time() - tic)
+                "\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) "
+                "| %.2f s" % (prefix, 0, 0, si_prefix, 0, total, time.time() - tic)
             )
             sys.stdout.flush()
 
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index fc3b245d88ad..6ffae08621e0 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -25,6 +25,7 @@
 from tvm._ffi.base import string_types
 from tvm.contrib import graph_executor
 from . import debug_result
+from ...runtime.profiling import Report
 
 _DUMP_ROOT_PREFIX = "tvmdbg_"
 _DUMP_PATH_PREFIX = "_tvmdbg_"
@@ -102,6 +103,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._execute_node = module["execute_node"]
         self._get_node_output = module["get_node_output"]
         self._profile = module["profile"]
+        self._profile_rpc = module["profile_rpc"]
         graph_executor.GraphModule.__init__(self, module)
         self._create_debug_env(graph_json_str, device)
 
@@ -274,7 +276,7 @@ def profile(self, collectors=None, **input_dict):
         Parameters
         ----------
         collectors : Optional[Sequence[MetricCollector]]
-            Extra metrics to collect.
+            Extra metrics to collect. If profiling over RPC, collectors must be `None`.
         input_dict : dict of str to NDArray
             List of input values to be fed to
 
@@ -284,10 +286,13 @@
         timing_results : str
             Per-operator and whole graph timing results in a table format.
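 
         Example (a sketch; assumes a debug executor ``gr`` and an input ``x``)::
 
             report = gr.profile(data=tvm.nd.array(x))
             print(report)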
""" - collectors = [] if collectors is None else collectors if input_dict: self.set_input(**input_dict) + if self.module.type_key == "rpc": + # We cannot serialize MetricCollectors over RPC + assert collectors is None, "Profiling with collectors is not supported over RPC" + return Report.from_json(self._profile_rpc()) return self._profile(collectors) def exit(self): diff --git a/python/tvm/contrib/hexagon.py b/python/tvm/contrib/hexagon.py index 6364ef749dd9..fe256163f73c 100644 --- a/python/tvm/contrib/hexagon.py +++ b/python/tvm/contrib/hexagon.py @@ -194,9 +194,10 @@ def mutate(stmt): if isinstance(stmt, tvm.tir.Allocate): var = stmt.buffer_var scope = var.type_annotation.storage_scope + is_vtcm = var in vtcm_buffers if scope == "local.vtcm": vtcm_buffers.pop() - if var in vtcm_buffers: + if is_vtcm: is_null = tvm.tir.call_intrin("bool", tvm.ir.Op.get("tir.isnullptr"), var) throw_error = tvm.tir.call_intrin( "int32", tvm.ir.Op.get("tir.tvm_throw_last_error") diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py new file mode 100644 index 000000000000..36c03891d210 --- /dev/null +++ b/python/tvm/contrib/pipeline_executor.py @@ -0,0 +1,543 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Pipeline executor that executes a series of modules in a pipeline fashion.""" +import json +import tvm._ffi +from tvm import relay +from tvm.relay.transform import InferType +from tvm.contrib import graph_executor + + +def pipeline_executor_enabled(): + """Check if the pipeline executor is enabled. + + Return + ------- + enable: bool + Return whether the pipeline executor is enabled. + """ + return tvm._ffi.get_global_func("tvm.pipeline_executor.create", allow_missing=True) is not None + + +def build(pipe_configs): + """Build modules used in the pipeline executor, then use these modules and configuration + to create a pipeline executor. + + Parameters + ---------- + pipe_configs: PipelineConfig + Build Configuration information. + + Returns + ------- + ret: PipelineExecutorFactoryModule + Common interface for pipeline executor factory modules. + """ + mods = {} + mod_n_configs = pipe_configs.get_config() + config_len = len(mod_n_configs) + string_config = [{} for _ in range(config_len)] + for ir_mod, mod_config in mod_n_configs.items(): + mconf = mod_config["pipeline"].copy() + mod_idx = mconf["mod_idx"] - 1 + dev = mod_config["dev"] + target = mod_config["target"] + build_func = relay.build + # Check whether there is a customized build function. 
+        if "build" in mod_config and mod_config["build"]:
+            build_func = mod_config["build"]
+
+        mod = build_func(
+            ir_mod,
+            target,
+            params=mod_config["params"],
+            target_host=mod_config["target_host"],
+            mod_name=mod_config["mod_name"],
+        )
+
+        mconf["dev"] = "{},{}".format(dev.device_type, dev.device_id)
+        # Create a pipeline configuration.
+        string_config[mod_idx] = mconf
+        mods[mod] = {"dev": dev}
+
+    return PipelineExecutorFactoryModule(mods, string_config)
+
+
+class PipelineModule(object):
+    """Wrapper of the runtime module; callers can use this module to set parameters and get outputs.
+
+    Parameters
+    ----------
+    module : PipelineExecutorFactoryModule
+        Common interface for pipeline executor factory modules.
+    """
+
+    def __init__(self, module):
+        self.module = module.module
+
+
+class PipelineConfig(object):
+    """Pipeline configuration information; this class contains the DAG that expresses
+    the dependencies between the modules involved in a pipeline and the parameters for
+    building each module.
+    """
+
+    class Binding:
+        """This class defines the module connection information.
+        The type can only be "input" or "output".
+
+        Parameters
+        ----------
+        owner : ModuleWrapper
+            The class that owns this interface.
+
+        io_type : str
+            The I/O type of this interface. It can only be "input" or "output".
+
+        name : str/integer
+            Name; for input it is a string such as "data0", for output it is an integer such as 0.
+
+        data_type: TensorType
+            The data type of this interface.
+        """
+
+        def __init__(self, owner, io_type, name, data_type=None):
+            self.io_owner = owner
+            self.io_type = io_type
+            self.name = str(name)
+            # Child interfaces that depend on this interface.
+            self.bindings = []
+            # Parent interfaces that this interface depends on.
+            self.parents = []
+
+            self.data_type = data_type
+
+        def get_name(self):
+            # Return the name of this interface and the name of the owner of this interface.
+            owner_name = ""
+            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
+                owner_name = self.io_owner.name
+
+            return owner_name, self.name
+
+        def get_owner_idx(self):
+            # If the owner is ModuleWrapper return the owner index, if not return 0.
+            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
+                return self.io_owner.idx
+
+            return 0
+
+        def is_global_interface(self):
+            """A global interface is an interface visible to the caller that uses the pipeline
+            executor. The global input interface is responsible for passing parameters to the
+            internal module interfaces, and the global output interface is responsible for
+            returning the results computed by the pipeline executor to the caller.
+            """
+            return not isinstance(self.io_owner, PipelineConfig.ModuleWrapper)
+
+        def __repr__(self):
+            # Get all binding information.
+            ret = "  |{}: ".format(self.name)
+            for binding in self.bindings:
+                mname, dname = binding.get_name()
+                ret += "{0}:{1} ".format(mname, dname)
+            return ret
+
+        def check_dag_acyclic(self, start, inputs):
+            """This is to check whether the DAG containing these input interfaces is acyclic.
+            Parameters
+            ----------
+            start: ModuleWrapper
+                The starting node of the cycle check algorithm.
+
+            inputs: Binding
+                These interfaces are connected to each other to build the DAG.
+
+            Return
+            ------
+                Return true if there is no cycle in the DAG.
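+
+                The check is a depth-first walk over parent links; reaching
+                ``start`` again means the proposed connection would form a cycle.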
+            """
+            for binding in inputs.values():
+                if start == binding.io_owner:
+                    return False
+                for p in binding.parents:
+                    if not self.check_dag_acyclic(start, p.io_owner.input_bindings.bindings):
+                        return False
+
+            return True
+
+        def connect(self, binding):
+            """Connect the current interface to the destination interface.
+            Correct connections are as follows: 1. global input connected to module input,
+            2. module output connected to global output, 3. module output connected to
+            module input.
+
+            Parameters
+            ----------
+            binding: Binding
+                The destination of this connection.
+            """
+
+            # Check whether the binding setting is correct or not.
+            if self.io_owner == binding.io_owner:
+                raise RuntimeError("Cannot bind an interface to itself.")
+
+            if not self.is_global_interface() and self.io_type == "input":
+                raise RuntimeError("A module can only bind from an output interface!")
+
+            if (
+                not self.is_global_interface()
+                and not binding.is_global_interface()
+                and binding.io_type == "output"
+            ):
+                raise RuntimeError("Cannot bind a module output to another module output!")
+
+            if (
+                not self.is_global_interface()
+                and binding.is_global_interface()
+                and binding.io_type == "input"
+            ):
+                raise RuntimeError("Cannot bind a module output to a global input!")
+
+            if self.is_global_interface() and self.io_type == "output":
+                raise RuntimeError("A global output cannot be used as a binding start point.")
+
+            if self.is_global_interface() and binding.io_type != "input":
+                raise RuntimeError("A global input can only bind to a module input.")
+
+            self.bindings.append(binding)
+            if not self.is_global_interface():
+                # Check whether the data types of the source and destination are the same.
+                if (
+                    isinstance(binding.io_owner, PipelineConfig.ModuleWrapper)
+                    and self.data_type != binding.data_type
+                ):
+                    raise RuntimeError(
+                        "Illegal type (%s vs. %s): binding types are not the same!"
+                        % (self.data_type, binding.data_type)
+                    )
+
+                binding.parents.append(self)
+
+                # Do the acyclic check after increasing the in-degree of the child node by setting
+                # the current interface as a parent of the child node.
+
+                if not self.check_dag_acyclic(
+                    binding.io_owner, self.io_owner.input_bindings.bindings
+                ):
+                    raise RuntimeError("Illegal connection: it would create a cycle!")
+
+    class BindingList:
+        """Container for bindings (input or output interfaces).
+
+        Parameters
+        ----------
+        owner : ModuleWrapper/PipelineConfig
+            The owner of this class can be ModuleWrapper or PipelineConfig.
+
+        io_type : str
+            The type of this class can be "input" or "output".
+        """
+
+        def __init__(self, owner, io_type):
+            self.bindings = {}
+            self.io_owner = owner
+            self.binding_type = io_type
+
+        def get_binding_data_type(self, key):
+            if isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
+                return self.io_owner.get_data_type(key, self.binding_type)
+            return None
+
+        def __getitem__(self, key):
+            if key not in self.bindings:
+                data_type = self.get_binding_data_type(key)
+                if not data_type and isinstance(self.io_owner, PipelineConfig.ModuleWrapper):
+                    raise RuntimeError(f"Cannot find {key} in binding list {self.binding_type}.")
+
+                self.bindings[key] = PipelineConfig.Binding(
+                    self.io_owner, self.binding_type, key, data_type
+                )
+
+            return self.bindings[key]
+
+    class ModuleWrapper:
+        """This class is a wrapper representing the module and contains information such as
+        module information, binding information and building information.
+ """ + + def __init__(self, mod=None): + self.target_host = None + self.build_func = None + self.params = None + self.target = None + self.name = None + self.dev = None + self.idx = None + self.mod = mod + self.input_params = InferType()(mod)["main"].params + self.output_type = InferType()(mod)["main"].checked_type.ret_type + self.input_bindings = PipelineConfig.BindingList(self, "input") + self.output_bindings = PipelineConfig.BindingList(self, "output") + + def __eq__(self, other): + if isinstance(other, PipelineConfig.ModuleWrapper): + return self.mod == other.mod + + return False + + def __getitem__(self, key): + if isinstance(key, str): + if key == "input": + return self.input_bindings + + if key == "output": + return self.output_bindings + + raise RuntimeError(f"{key} not found!") + + def get_data_type(self, key, interface_type): + """Get the module interface data type according to the key value and interface type. + Parameters + ---------- + key: str + The interface name. + + interface_type: + The interface type. + + Return + ------- + Return data type. + """ + if interface_type == "input": + for param in self.input_params: + if param.name_hint == key: + return param._checked_type_ + + if interface_type == "output": + if isinstance(self.output_type, tvm.ir.type.TupleType): + if int(key) < len(self.output_type.fields): + return self.output_type.fields[int(key)] + elif int(key) == 0: + return self.output_type + + return None + + def set_idx_name(self, idx): + # Set the index value and generate the module name. + self.idx = idx + self.name = "mod{}".format(str(idx)) + + def is_root_mod(self): + """Check whether this node is the root node in DAG, this function is used + in topological sort. + """ + return all([not b.parents for b in self.input_bindings.bindings.values()]) + + def remove_self_from_bindings(self): + """Remove the current node from child dependencies to reduce the in-degree + of child node, this function is used in topological sort. + """ + for binding in self.output_bindings.bindings.values(): + for child in binding.bindings: + if binding in child.parents: + child.parents.remove(binding) + + def __init__(self): + self.mod_wrapper = {} + self.input_bindings = self.BindingList(self, "input") + self.output_bindings = self.BindingList(self, "output") + + def __str__(self): + # Get configuration information as a string. + + # Use topological sort to get correct module order. + self.dag_topology_sort() + # Get the input dependencies. + input_dump = "Inputs\n" + for input_name in self.input_bindings.bindings: + inf = self.input_bindings.bindings[input_name] + input_dump += str(inf) + "\n" + + # Get the connections information of each module. + output = {} + connections_dump = "\nconnections\n" + for mod in self.mod_wrapper: + for interface in self.mod_wrapper[mod].output_bindings.bindings.values(): + if interface.bindings: + mname, dname = interface.get_name() + iname = mname + ".output(" + dname + ")->" + for dep in interface.bindings: + dep_mname, dep_dname = dep.get_name() + if isinstance(dep.io_owner, PipelineConfig.ModuleWrapper): + iname += f" {dep_mname}.{dep_dname}" + connections_dump += f" |{iname}\n" + else: + output[dep_dname] = f"{mname}.output({dname})" + + # Get the output dependencies. 
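+        # `output` (filled while walking the connections above) maps each global
+        # output name to the "<module>.output(<port>)" interface that produces it.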
+        output_dump = "\noutput\n"
+        for name in sorted(output.keys()):
+            output_dump += f"  |output({name}) : {output[name]}\n"
+
+        return input_dump + output_dump + connections_dump
+
+    def __getitem__(self, key):
+        if isinstance(key, tvm.ir.module.IRModule):
+            if key not in self.mod_wrapper:
+                self.mod_wrapper[key] = self.ModuleWrapper(key)
+            return self.mod_wrapper[key]
+
+        if isinstance(key, str):
+            if key == "input":
+                return self.input_bindings
+            if key == "output":
+                return self.output_bindings
+
+        raise RuntimeError(f"{key} not found.")
+
+    def get_config(self):
+        """Get the configuration information in dictionary form; this configuration
+        will be used to create the pipeline executor.
+        """
+
+        # Use topological sort to get the correct order of modules.
+        self.dag_topology_sort()
+        mconfig = {}
+        for mod in self.mod_wrapper:
+            # Generate pipeline configuration.
+            mconf = {}
+            output_conf = []
+            module = self.mod_wrapper[mod]
+            for _, binding in module.output_bindings.bindings.items():
+                dep_conf = []
+                output = {}
+                if binding.bindings:
+                    for dep in binding.bindings:
+                        dep_item = {}
+                        _, dname = dep.get_name()
+                        dep_item["mod_idx"] = dep.get_owner_idx()
+                        dep_item["input_name"] = dname
+                        dep_conf.append(dep_item)
+
+                # The value of output_idx starts from 0.
+                output["output_idx"] = int(binding.name)
+                output["dependent"] = dep_conf
+                output_conf.append(output)
+
+            mconf["mod_idx"] = module.idx
+            mconf["output"] = output_conf
+
+            mconfig[mod] = {
+                "pipeline": mconf,
+                "target_host": module.target_host,
+                "mod_name": "default",
+                "build": module.build_func,
+                "params": module.params,
+                "target": module.target,
+                "dev": module.dev,
+            }
+
+        return mconfig
+
+    def dag_topology_sort(self):
+        """Use topological sort to get the order of the pipeline modules."""
+        mlist = []
+        mod_wrapper = self.mod_wrapper.copy()
+        while mod_wrapper:
+            temp_list = []
+            for mod, wrapper in mod_wrapper.items():
+                if wrapper.is_root_mod():
+                    temp_list.append(mod)
+                    wrapper.remove_self_from_bindings()
+
+            for mod in temp_list:
+                mod_wrapper.pop(mod, None)
+
+            mlist += temp_list
+
+        for i, mod in enumerate(mlist):
+            self.mod_wrapper[mod].set_idx_name(i + 1)
+
+    def get_mod_idx(self, mod):
+        # Return the module index.
+        idx = self.mod_wrapper[mod].idx
+        return idx
+
+    def pipe_input(self, name):
+        # Return the input interface according to the name.
+        return self.input_bindings[name]
+
+    def pipe_output(self, idx):
+        # Return the output interface according to the index.
+        return self.output_bindings[idx]
+
+
+class PipelineExecutorFactoryModule(object):
+    """Common interface for pipeline executor factory modules.
+
+    Parameters
+    ----------
+    pipeline_mods : List[GraphExecutorFactoryModule]
+        List of GraphExecutorFactoryModule.
+
+    mod_config : Dict[int, Dict[str, Any]]
+        Module dependency configuration information.
+
+    """
+
+    def __init__(self, pipeline_mods, mods_config):
+        mods, config = self.graph_executor_create(pipeline_mods, mods_config)
+        assert (
+            pipeline_executor_enabled()
+        ), "Pipeline executor is not enabled. Please \
+            re-build TVM with USE_PIPELINE_EXECUTOR=ON"
+        pipeline_create = tvm._ffi.get_global_func(
+            "tvm.pipeline_executor.create", allow_missing=False
+        )
+        assert pipeline_create
+        self.module = pipeline_create(mods, config)
+
+    def graph_executor_create(self, pipeline_mods, mod_config):
+        """Create the graph_executor list and return the configuration as a JSON string.
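+
+        Each factory module is instantiated on its configured device, and the
+        per-module dependency configuration is serialized to JSON for the
+        underlying pipeline executor.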
+
+        Parameters
+        ----------
+        pipeline_mods : List[GraphExecutorFactoryModule]
+            List of GraphExecutorFactoryModule
+
+        mod_config : Dict[str, Any]
+            Module dependency configuration information.
+
+        Returns
+        -------
+        mods : List[Module]
+            The Module list.
+
+        mod_config : str
+            The module configuration serialized to JSON.
+        """
+
+        mods = []
+        for pipeline_mod in pipeline_mods:
+            mod = graph_executor.GraphModule(
+                pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"])
+            )
+            mods.append(mod.module)
+
+        return mods, json.dumps(mod_config)
diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index ac1a41a0c4a9..b1f00b7d1dde 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -24,6 +24,7 @@
 
 from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
 from tvm.relay.op.contrib.ethosn import partition_for_ethosn
+from tvm.relay.op.contrib.cmsisnn import partition_for_cmsisnn
 from tvm.relay.op.contrib.bnns import partition_for_bnns
 from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai
 
@@ -49,6 +50,10 @@
         "config_key": None,
         "pass_pipeline": partition_for_arm_compute_lib,
     },
+    "cmsis-nn": {
+        "config_key": None,
+        "pass_pipeline": partition_for_cmsisnn,
+    },
     "ethos-n77": {
         "config_key": "relay.ext.ethos-n.options",
         "pass_pipeline": partition_for_ethosn,
diff --git a/python/tvm/ir/affine_type.py b/python/tvm/ir/affine_type.py
index a1ce08017b1b..bd77c187af40 100644
--- a/python/tvm/ir/affine_type.py
+++ b/python/tvm/ir/affine_type.py
@@ -48,10 +48,15 @@ class TensorAffineType(AffineType):
 
     dtype : str
         The content data type.
+
+    axis : int
+        The axis for per-channel quantization.
     """
 
-    def __init__(self, scale, zero_point, dtype):
-        self.__init_handle_by_constructor__(_ffi_api.TensorAffineType, scale, zero_point, dtype)
+    def __init__(self, scale, zero_point, dtype, axis=-1):
+        self.__init_handle_by_constructor__(
+            _ffi_api.TensorAffineType, scale, zero_point, dtype, axis
+        )
 
 
 @tvm._ffi.register_object("TupleAffineType")
diff --git a/python/tvm/ir/container.py b/python/tvm/ir/container.py
index c62952554bc6..3c7a57a830d9 100644
--- a/python/tvm/ir/container.py
+++ b/python/tvm/ir/container.py
@@ -38,6 +38,16 @@ def __getitem__(self, idx):
     def __len__(self):
         return _ffi_api.ArraySize(self)
 
+    def __dir__(self):
+        return sorted(dir(self.__class__) + ["type_key"])
+
+    def __getattr__(self, name):
+        if name == "handle":
+            raise AttributeError("handle is not set")
+        if name == "type_key":
+            return super().__getattr__(name)
+        raise AttributeError("%s has no attribute %s" % (str(type(self)), name))
+
 
 @tvm._ffi.register_object
 class Map(Object):
@@ -59,6 +69,16 @@ def __iter__(self):
         for i in range(len(self)):
             yield akvs[i * 2]
 
+    def __dir__(self):
+        return sorted(dir(self.__class__) + ["type_key"])
+
+    def __getattr__(self, name):
+        if name == "handle":
+            raise AttributeError("handle is not set")
+        if name == "type_key":
+            return super().__getattr__(name)
+        raise AttributeError("%s has no attribute %s" % (str(type(self)), name))
+
     def keys(self):
         return iter(self)
 
diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py
index 88dcde8ceaf0..2aea9d3fd61d 100644
--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -16,6 +16,8 @@
 # under the License.
"""MicroTVM module for bare-metal backends""" +from .build import autotvm_build_func +from .build import AutoTvmModuleLoader from .build import get_standalone_crt_dir from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError from .project import generate_project, GeneratedProject, TemplateProject diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 16e7ed24cb4f..9e278081933c 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -17,10 +17,15 @@ """Defines top-level glue functions for building microTVM artifacts.""" +import json import logging import os +import pathlib +import contextlib +from typing import Union from .._ffi import libinfo +from .. import rpc as _rpc _LOG = logging.getLogger(__name__) @@ -57,3 +62,54 @@ def get_standalone_crt_dir() -> str: raise CrtNotFoundError() return STANDALONE_CRT_DIR + + +class AutoTvmModuleLoader: + """MicroTVM AutoTVM Module Loader + + Parameters + ---------- + template_project_dir : Union[pathlib.Path, str] + project template path + + project_options : dict + project generation option + """ + + def __init__( + self, template_project_dir: Union[pathlib.Path, str], project_options: dict = None + ): + self._project_options = project_options + + if isinstance(template_project_dir, (pathlib.Path, str)): + self._template_project_dir = str(template_project_dir) + elif not isinstance(template_project_dir, str): + raise TypeError(f"Incorrect type {type(template_project_dir)}.") + + @contextlib.contextmanager + def __call__(self, remote_kw, build_result): + with open(build_result.filename, "rb") as build_file: + build_result_bin = build_file.read() + + tracker = _rpc.connect_tracker(remote_kw["host"], remote_kw["port"]) + remote = tracker.request( + remote_kw["device_key"], + priority=remote_kw["priority"], + session_timeout=remote_kw["timeout"], + session_constructor_args=[ + "tvm.micro.compile_and_create_micro_session", + build_result_bin, + self._template_project_dir, + json.dumps(self._project_options), + ], + ) + system_lib = remote.get_function("runtime.SystemLib")() + yield remote, system_lib + + +def autotvm_build_func(): + """A dummy build function which causes autotvm to use a different export format.""" + + +# A sentinel value for the output format. +autotvm_build_func.output_format = ".model-library-format" diff --git a/python/tvm/micro/interface_api.py b/python/tvm/micro/interface_api.py index 8086b1ed6554..d9961e9de3f9 100644 --- a/python/tvm/micro/interface_api.py +++ b/python/tvm/micro/interface_api.py @@ -17,7 +17,13 @@ """Defines functions for generating a C interface header""" +# TODO: Currently the Interface API header is generated in Python but the source it references +# is generated in C++. These should be consolidated to generate both header and source in C++ +# and avoid re-implementing logic, such as name sanitising, in the two different languages. +# See https://github.com/apache/tvm/issues/8792 . 
+ import os +import re from tvm.relay.backend.utils import mangle_module_name @@ -58,8 +64,13 @@ def generate_c_interface_header(module_name, inputs, outputs, output_path): _emit_brief(header_file, module_name, "Input tensor pointers") header_file.write(f"struct {mangled_name}_inputs {{\n") + sanitized_names = [] for input_name in inputs: - header_file.write(f" void* {input_name};\n") + sanitized_input_name = re.sub(r"\W", "_", input_name) + if sanitized_input_name in sanitized_names: + raise ValueError(f"Sanitized input tensor name clash: {sanitized_input_name}") + sanitized_names.append(sanitized_input_name) + header_file.write(f" void* {sanitized_input_name};\n") header_file.write("};\n\n") _emit_brief(header_file, module_name, "Output tensor pointers") diff --git a/python/tvm/micro/project.py b/python/tvm/micro/project.py index 8d1408c679fb..8a62c9b5f9ba 100644 --- a/python/tvm/micro/project.py +++ b/python/tvm/micro/project.py @@ -92,30 +92,35 @@ class TemplateProject: """Defines a glue interface to interact with a template project through the API Server.""" @classmethod - def from_directory(cls, template_project_dir, options): - return cls(client.instantiate_from_dir(template_project_dir), options) + def from_directory(cls, template_project_dir): + return cls(client.instantiate_from_dir(template_project_dir)) - def __init__(self, api_client, options): + def __init__(self, api_client): self._api_client = api_client - self._options = options self._info = self._api_client.server_info_query(__version__) if not self._info["is_template"]: raise NotATemplateProjectError() - def generate_project(self, graph_executor_factory, project_dir): - """Generate a project given GraphRuntimeFactory.""" - model_library_dir = utils.tempdir() - model_library_format_path = model_library_dir.relpath("model.tar") - export_model_library_format(graph_executor_factory, model_library_format_path) - + def generate_project_from_mlf(self, model_library_format_path, project_dir, options): self._api_client.generate_project( - model_library_format_path=model_library_format_path, + model_library_format_path=str(model_library_format_path), standalone_crt_dir=get_standalone_crt_dir(), project_dir=project_dir, - options=self._options, + options=options, ) - return GeneratedProject.from_directory(project_dir, self._options) + return GeneratedProject.from_directory(project_dir, options) + + def info(self): + return self._info + + def generate_project(self, graph_executor_factory, project_dir, options): + """Generate a project given GraphRuntimeFactory.""" + model_library_dir = utils.tempdir() + model_library_format_path = model_library_dir.relpath("model.tar") + export_model_library_format(graph_executor_factory, model_library_format_path) + + return self.generate_project_from_mlf(model_library_format_path, project_dir, options) def generate_project( @@ -147,5 +152,5 @@ def generate_project( GeneratedProject : A class that wraps the generated project and which can be used to further interact with it. 
""" - template = TemplateProject.from_directory(str(template_project_dir), options) - return template.generate_project(module, str(generated_project_dir)) + template = TemplateProject.from_directory(str(template_project_dir)) + return template.generate_project(module, str(generated_project_dir), options) diff --git a/python/tvm/micro/project_api/__init__.py b/python/tvm/micro/project_api/__init__.py new file mode 100644 index 000000000000..9915040a922c --- /dev/null +++ b/python/tvm/micro/project_api/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""MicroTVM Project API Client and Server""" diff --git a/python/tvm/micro/project_api/client.py b/python/tvm/micro/project_api/client.py index f650ad946d87..ac8ff629a718 100644 --- a/python/tvm/micro/project_api/client.py +++ b/python/tvm/micro/project_api/client.py @@ -14,11 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +""" +Project API client. 
+""" import base64 import io import json import logging +import platform import os import pathlib import subprocess @@ -56,6 +59,7 @@ class UnsupportedProtocolVersionError(ProjectAPIErrorBase): class RPCError(ProjectAPIErrorBase): def __init__(self, request, error): + ProjectAPIErrorBase.__init__() self.request = request self.error = error @@ -129,7 +133,8 @@ def _request_reply(self, method, params): if "error" in reply: raise server.JSONRPCError.from_json(f"calling method {method}", reply["error"]) - elif "result" not in reply: + + if "result" not in reply: raise MalformedReplyError(f"Expected 'result' key in server reply, got {reply!r}") return reply["result"] @@ -189,7 +194,7 @@ def write_transport(self, data, timeout_sec): # NOTE: windows support untested SERVER_LAUNCH_SCRIPT_FILENAME = ( - f"launch_microtvm_api_server.{'sh' if os.system != 'win32' else '.bat'}" + f"launch_microtvm_api_server.{'sh' if platform.system() != 'Windows' else '.bat'}" ) @@ -197,7 +202,8 @@ def write_transport(self, data, timeout_sec): def instantiate_from_dir(project_dir: typing.Union[pathlib.Path, str], debug: bool = False): - """Launch server located in project_dir, and instantiate a Project API Client connected to it.""" + """Launch server located in project_dir, and instantiate a Project API Client + connected to it.""" args = None project_dir = pathlib.Path(project_dir) @@ -224,7 +230,7 @@ def instantiate_from_dir(project_dir: typing.Union[pathlib.Path, str], debug: bo if debug: args.append("--debug") - api_server_proc = subprocess.Popen( + api_server_proc = subprocess.Popen( # pylint: disable=unused-variable args, bufsize=0, pass_fds=(api_server_read_fd, api_server_write_fd), cwd=project_dir ) os.close(api_server_read_fd) diff --git a/python/tvm/micro/project_api/server.py b/python/tvm/micro/project_api/server.py index 144f0cb6dee1..cee0205303f0 100644 --- a/python/tvm/micro/project_api/server.py +++ b/python/tvm/micro/project_api/server.py @@ -34,7 +34,6 @@ import re import select import sys -import textwrap import time import traceback import typing @@ -100,6 +99,7 @@ class JSONRPCError(Exception): """An error class with properties that meet the JSON-RPC error spec.""" def __init__(self, code, message, data, client_context=None): + Exception.__init__(self) self.code = code self.message = message self.data = data @@ -123,9 +123,7 @@ def __str__(self): @classmethod def from_json(cls, client_context, json_error): - # Subclasses of ServerError capture exceptions that occur in the Handler, and thus return a - # traceback. The encoding in `json_error` is also slightly different to allow the specific subclass - # to be identified. 
+ """Convert an encapsulated ServerError into JSON-RPC compliant format.""" found_server_error = False try: if ErrorCode(json_error["code"]) == ErrorCode.SERVER_ERROR: @@ -145,6 +143,8 @@ def from_json(cls, client_context, json_error): class ServerError(JSONRPCError): + """Superclass for JSON-RPC errors which occur while processing valid requests.""" + @classmethod def from_exception(cls, exc, **kw): to_return = cls(**kw) @@ -168,21 +168,25 @@ def __str__(self): super_str = super(ServerError, self).__str__() return context_str + super_str - def set_traceback(self, traceback): + def set_traceback(self, traceback): # pylint: disable=redefined-outer-name + """Format a traceback to be embedded in the JSON-RPC format.""" + if self.data is None: self.data = {} if "traceback" not in self.data: # NOTE: TVM's FFI layer reorders Python stack traces several times and strips # intermediary lines that start with "Traceback". This logic adds a comment to the first - # stack frame to explicitly identify the first stack frame line that occurs on the server. + # stack frame to explicitly identify the first stack frame line that occurs on the + # server. traceback_list = list(traceback) - # The traceback list contains one entry per stack frame, and each entry contains 1-2 lines: + # The traceback list contains one entry per stack frame, and each entry contains 1-2 + # lines: # File "path/to/file", line 123, in : # - # We want to place a comment on the first line of the outermost frame to indicate this is the - # server-side stack frame. + # We want to place a comment on the first line of the outermost frame to indicate this + # is the server-side stack frame. first_frame_list = traceback_list[1].split("\n") self.data["traceback"] = ( traceback_list[0] @@ -307,7 +311,8 @@ def flash(self, options: dict): def open_transport(self, options: dict) -> TransportTimeouts: """Open resources needed for the transport layer. - This function might e.g. open files or serial ports needed in write_transport or read_transport. + This function might e.g. open files or serial ports needed in write_transport or + read_transport. Calling this function enables the write_transport and read_transport calls. If the transport is not open, this method is a no-op. @@ -323,7 +328,8 @@ def open_transport(self, options: dict) -> TransportTimeouts: def close_transport(self): """Close resources needed to operate the transport layer. - This function might e.g. close files or serial ports needed in write_transport or read_transport. + This function might e.g. close files or serial ports needed in write_transport or + read_transport. Calling this function disables the write_transport and read_transport calls. If the transport is not open, this method is a no-op. @@ -331,6 +337,7 @@ def close_transport(self): raise NotImplementedError() @abc.abstractmethod + # pylint: disable=unidiomatic-typecheck def read_transport(self, n: int, timeout_sec: typing.Union[float, type(None)]) -> bytes: """Read data from the transport. @@ -389,7 +396,8 @@ def write_transport(self, data: bytes, timeout_sec: float): class ProjectAPIServer: """Base class for Project API Servers. - This API server implements communication using JSON-RPC 2.0: https://www.jsonrpc.org/specification + This API server implements communication using JSON-RPC 2.0: + https://www.jsonrpc.org/specification Suggested use of this class is to import this module or copy this file into Project Generator implementations, then instantiate it with server.start(). 
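For orientation before the validation hunks that follow: a request that passes _validate_request looks like the sketch below ("method" must match ^[a-zA-Z0-9_]+$, "params" must be a dict, and "id" may be a string, an integer, or null). The parameter name here mirrors server_info_query's signature and is otherwise illustrative:

import json

request = {
    "jsonrpc": "2.0",
    "method": "server_info_query",          # must match the method-name regex
    "params": {"tvm_version": "0.8.dev0"},  # keyword arguments, always a dict
    "id": 1,                                # str, int, or None are accepted
}
print(json.dumps(request))
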
@@ -451,7 +459,7 @@ def serve_one_request(self): _LOG.error("EOF") return False - except Exception as exc: + except Exception as exc: # pylint: disable=broad-except _LOG.error("Caught error reading request", exc_info=1) return False @@ -466,7 +474,7 @@ def serve_one_request(self): request_id = None if not did_validate else request.get("id") self._reply_error(request_id, exc) return did_validate - except Exception as exc: + except Exception as exc: # pylint: disable=broad-except message = "validating request" if did_validate: message = f"calling method {request['method']}" @@ -481,7 +489,7 @@ def serve_one_request(self): VALID_METHOD_RE = re.compile("^[a-zA-Z0-9_]+$") def _validate_request(self, request): - if type(request) is not dict: + if not isinstance(request, dict): raise JSONRPCError( ErrorCode.INVALID_REQUEST, f"request: want dict; got {request!r}", None ) @@ -493,7 +501,7 @@ def _validate_request(self, request): ) method = request.get("method") - if type(method) != str: + if not isinstance(method, str): raise JSONRPCError( ErrorCode.INVALID_REQUEST, f'request["method"]: want str; got {method!r}', None ) @@ -501,18 +509,20 @@ def _validate_request(self, request): if not self.VALID_METHOD_RE.match(method): raise JSONRPCError( ErrorCode.INVALID_REQUEST, - f'request["method"]: should match regex {self.VALID_METHOD_RE.pattern}; got {method!r}', + f'request["method"]: should match regex {self.VALID_METHOD_RE.pattern}; ' + f"got {method!r}", None, ) params = request.get("params") - if type(params) != dict: + if not isinstance(params, dict): raise JSONRPCError( ErrorCode.INVALID_REQUEST, f'request["params"]: want dict; got {type(params)}', None ) request_id = request.get("id") - if type(request_id) not in (str, int, type(None)): + # pylint: disable=unidiomatic-typecheck + if not isinstance(request_id, (str, int, type(None))): raise JSONRPCError( ErrorCode.INVALID_REQUEST, f'request["id"]: want str, number, null; got {request_id!r}', @@ -538,10 +548,11 @@ def _dispatch_request(self, request): params = {} for var_name, var_type in typing.get_type_hints(interface_method).items(): - if var_name == "self" or var_name == "return": + if var_name in ("self", "return"): continue - # NOTE: types can only be JSON-compatible types, so var_type is expected to be of type 'type'. + # NOTE: types can only be JSON-compatible types, so var_type is expected to be of type + # 'type'. if var_name not in request_params: raise JSONRPCError( ErrorCode.INVALID_PARAMS, @@ -553,7 +564,8 @@ def _dispatch_request(self, request): if not has_preprocessing and not isinstance(param, var_type): raise JSONRPCError( ErrorCode.INVALID_PARAMS, - f'method {request["method"]}: parameter {var_name}: want {var_type!r}, got {type(param)!r}', + f'method {request["method"]}: parameter {var_name}: want {var_type!r}, ' + f"got {type(param)!r}", None, ) @@ -636,7 +648,7 @@ def _await_nonblocking_ready(rlist, wlist, timeout_sec=None, end_time=None): return True -def read_with_timeout(fd, n, timeout_sec): +def read_with_timeout(fd, n, timeout_sec): # pylint: disable=invalid-name """Read data from a file descriptor, with timeout. This function is intended as a helper function for implementations of ProjectAPIHandler @@ -683,7 +695,7 @@ def read_with_timeout(fd, n, timeout_sec): return to_return -def write_with_timeout(fd, data, timeout_sec): +def write_with_timeout(fd, data, timeout_sec): # pylint: disable=invalid-name """Write data to a file descriptor, with timeout. 
This function is intended as a helper function for implementations of ProjectAPIHandler diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index d4ad5b84fb76..ced20b7ebfbf 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -17,14 +17,17 @@ """Defines a top-level glue class that operates the Transport and Flasher classes.""" +import json import logging import sys from ..error import register_error -from .._ffi import get_global_func +from .._ffi import get_global_func, register_func from ..contrib import graph_executor +from ..contrib import utils from ..contrib.debugger import debug_executor from ..rpc import RPCSession +from . import project from .transport import IoTimeoutError from .transport import TransportLogger @@ -127,6 +130,7 @@ def __enter__(self): int(timeouts.session_start_retry_timeout_sec * 1e6), int(timeouts.session_start_timeout_sec * 1e6), int(timeouts.session_established_timeout_sec * 1e6), + self._shutdown, ) ) self.device = self._rpc.cpu(0) @@ -140,6 +144,9 @@ def __exit__(self, exc_type, exc_value, exc_traceback): """Tear down this session and associated RPC session resources.""" self.transport.__exit__(exc_type, exc_value, exc_traceback) + def _shutdown(self): + self.__exit__(None, None, None) + def lookup_remote_linked_param(mod, storage_id, template_tensor, device): """Lookup a parameter that has been pre-linked into a remote (i.e. over RPC) Module. @@ -234,3 +241,54 @@ def create_local_debug_executor(graph_json_str, mod, device, dump_root=None): graph_json_str, dump_root=dump_root, ) + + +@register_func("tvm.micro.compile_and_create_micro_session") +def compile_and_create_micro_session( + mod_src_bytes: bytes, + template_project_dir: str, + project_options: dict = None, +): + """Compile the given libraries and sources into a MicroBinary, then invoke create_micro_session. + + Parameters + ---------- + mod_src_bytes : bytes + The content of a tarfile which contains the TVM-generated sources which together form the + SystemLib. This tar is expected to be created by export_library. The tar will be extracted + into a directory and the sources compiled into a MicroLibrary using the Compiler. + + template_project_dir: str + The path to a template microTVM Project API project which is used to generate the embedded + project that is built and flashed onto the target device. + + project_options: dict + Options for the microTVM API Server contained in template_project_dir. + """ + + temp_dir = utils.tempdir() + # Keep temp directory for generate project + temp_dir.set_keep_for_debug(True) + model_library_format_path = temp_dir / "model.tar.gz" + with open(model_library_format_path, "wb") as mlf_f: + mlf_f.write(mod_src_bytes) + + try: + template_project = project.TemplateProject.from_directory(template_project_dir) + generated_project = template_project.generate_project_from_mlf( + model_library_format_path, + str(temp_dir / "generated-project"), + options=json.loads(project_options), + ) + except Exception as exception: + logging.error("Project Generate Error: %s", str(exception)) + raise exception + + generated_project.build() + generated_project.flash() + transport = generated_project.transport() + + rpc_session = Session(transport_context_manager=transport) + # RPC exit is called by shutdown function. 
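+    # NOTE: the session is deliberately entered here without a matching local
+    # __exit__; Session registers its _shutdown callback with the RPC layer
+    # (see Session.__enter__ above), so teardown happens when the remote RPC
+    # session shuts down.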
+ rpc_session.__enter__() + return rpc_session._rpc._sess diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index c7b6c60849a1..524f69bcdd13 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -384,6 +384,23 @@ def extract_fused_functions(mod): return ret +def list_op_freqs(mod): + """Pass to extract unique operator names and how frequently they appear + in an IRModule. Fused functions are traversed to count the operators + that compose them. + + Parameters + ---------- + mod : tvm.IRModule + + Returns + ------- + ret : Dict[str, int] + Dict of unique operator names to frequency + """ + return _ffi_api.ExtractOperators(mod) + + def search_fc_transpose(expr): """Search fc weight name in the patten: y = nn.dense(x, transpose(w, [1, 0])) diff --git a/python/tvm/relay/backend/__init__.py b/python/tvm/relay/backend/__init__.py index 4fc2b63748db..b84e215fa581 100644 --- a/python/tvm/relay/backend/__init__.py +++ b/python/tvm/relay/backend/__init__.py @@ -16,3 +16,4 @@ # under the License. """Backend codegen modules for relay.""" from . import compile_engine +from .contrib import cmsisnn diff --git a/python/tvm/relay/backend/contrib/__init__.py b/python/tvm/relay/backend/contrib/__init__.py new file mode 100644 index 000000000000..bfc5b79bb2ee --- /dev/null +++ b/python/tvm/relay/backend/contrib/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""External backend codegen modules for Relay.""" +from . import cmsisnn diff --git a/python/tvm/relay/backend/contrib/cmsisnn/__init__.py b/python/tvm/relay/backend/contrib/cmsisnn/__init__.py new file mode 100644 index 000000000000..cc6873f9fda6 --- /dev/null +++ b/python/tvm/relay/backend/contrib/cmsisnn/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""CMSIS-NN codegen modules for relay.""" +from . 
import codegen
diff --git a/python/tvm/relay/backend/contrib/cmsisnn/codegen.py b/python/tvm/relay/backend/contrib/cmsisnn/codegen.py
new file mode 100644
index 000000000000..ef08f5eb317d
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/cmsisnn/codegen.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Codegen for CMSIS-NN"""
+import tvm
+from tvm import relay
+from tvm.relay.expr_functor import ExprVisitor
+
+
+class GenerateTIR(ExprVisitor):
+    """Generates TIR module containing TIR primfuncs corresponding to the Relay operators.
+    Note: Relay operator to primfunc mapping may not be 1:1.
+    """
+
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+        self.tir_mod = None
+        self.scale = 1.0 / 256
+
+    def call_contains_op(self, call, op_name):
+        if not isinstance(call.op, tvm.ir.op.Op):
+            return False
+        if call.op.name != op_name:
+            return False
+        return True
+
+    def is_quantized_softmax(self, call):
+        """Checks for the following relay sequence:
+        a = qnn.dequantize(in, scale, zero_point)
+        b = nn.softmax(a)
+        c = qnn.quantize(b, scale, zero_point)
+        """
+        if not self.call_contains_op(call, "qnn.quantize"):
+            return False
+        softmax_call = call.args[0]
+        if not self.call_contains_op(softmax_call, "nn.softmax"):
+            return False
+        dequantize_call = softmax_call.args[0]
+        if not self.call_contains_op(dequantize_call, "qnn.dequantize"):
+            return False
+        self.scale = dequantize_call.args[1].data.numpy().item(0)
+        return True
+
+    def emit_softmax_tir(self, call):
+        """Generates TIR extern_call for softmax"""
+        shape = call.checked_type.shape  # NHWC
+        dtype = call.checked_type.dtype
+        ir_builder = tvm.tir.ir_builder.create()
+        in_buf = tvm.tir.decl_buffer(shape=shape, dtype=dtype)
+        out_buf = tvm.tir.decl_buffer(shape=shape, dtype=dtype)
+
+        trailing_dim = len(shape) - 1
+        num_rows = 1
+        for dim in range(trailing_dim):
+            num_rows *= shape[dim]
+        row_size = shape[trailing_dim]
+        ir_builder.emit(
+            tvm.tir.call_extern(
+                dtype,
+                "arm_softmax_s8",
+                in_buf.data,
+                num_rows,
+                row_size,
+                self.scale,
+                out_buf.data,
+            )
+        )
+        prim_func = tvm.tir.PrimFunc([in_buf, out_buf], ir_builder.get())
+        prim_func = prim_func.with_attr("global_symbol", self.name)
+        prim_func = prim_func.with_attr("tir.noalias", True)
+        self.tir_mod = tvm.IRModule({self.name: prim_func})
+
+    def visit_call(self, call):
+        """Iterates over the relay operators within the Relay external function"""
+        super().visit_call(call)
+        if self.is_quantized_softmax(call):
+            self.emit_softmax_tir(call)
+
+    def generate_tir(self, func):
+        self.visit(func)
+        return self.tir_mod
+
+
+def relay_to_tir(name, func):
+    """Lower a Relay function to TIR for the CMSIS-NN target.
+
+    The Relay function should only contain operations supported
+    by the CMSIS-NN target.
This is enforced by the graph partitioner + for CMSIS-NN. + + Parameters + ---------- + name: str + Name of the external relay function + func : tvm.relay.Function + The Relay function to lower. + + Returns + ------- + mod : tvm.IRModule + The lowered TIR module. + + """ + return GenerateTIR(name).generate_tir(func) + + +@tvm.register_func("relay.ext.cmsisnn") +def cmsisnn_compiler(relay_func): + """It compiles Relay's external function into equivalent TIR + and subsequently converts that into 'c' code. During the 'c' + code generation, it embeds CMSIS-NN APIs for the corresponding + operators. + """ + mod = tvm.IRModule() + mod["main"] = relay_func + mod = relay.transform.InferType()(mod) + func_name = relay_func.attrs["global_symbol"] + tir_mod = relay_to_tir(func_name, mod["main"]) + cmsisnn_runtime = tvm._ffi.get_global_func("runtime.module.cmsisnn.create") + return cmsisnn_runtime(tir_mod) diff --git a/python/tvm/relay/backend/contrib/ethosu/__init__.py b/python/tvm/relay/backend/contrib/ethosu/__init__.py new file mode 100644 index 000000000000..2b424ebb5dec --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm(R) Ethos(TM)-U NPU codegen modules for Relay.""" +from . import util +from . import legalize +from . import preprocess +from . import errors +from . import vela_api +from . import tir_to_cs_translator +from .util import partition_for_ethosu diff --git a/python/tvm/relay/backend/contrib/ethosu/_ffi_api.py b/python/tvm/relay/backend/contrib/ethosu/_ffi_api.py new file mode 100644 index 000000000000..ccf1039a6994 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/_ffi_api.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
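Stepping back from the diff for a moment: the CMSIS-NN pieces above (the tvmc "cmsis-nn" target, partition_for_cmsisnn, and the relay.ext.cmsisnn hook) compose roughly as sketched below. The graph is a toy int8 softmax matching the dequantize -> softmax -> quantize sequence is_quantized_softmax looks for; the scale and zero point values are illustrative:

import tvm
from tvm import relay
from tvm.relay.op.contrib.cmsisnn import partition_for_cmsisnn

x = relay.var("x", shape=(1, 16), dtype="int8")
deq = relay.qnn.op.dequantize(x, relay.const(1.0 / 256.0), relay.const(-128))
sm = relay.nn.softmax(deq)
q = relay.qnn.op.quantize(sm, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8")

mod = tvm.IRModule.from_expr(relay.Function([x], q))
mod = partition_for_cmsisnn(mod)  # offloads the matched pattern to relay.ext.cmsisnn
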
+"""FFI APIs for relay transformation passes.""" +import tvm._ffi # type: ignore + +tvm._ffi._init_api("relay.ext.ethosu", __name__) diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/python/tvm/relay/backend/contrib/ethosu/errors.py old mode 100755 new mode 100644 similarity index 55% rename from apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh rename to python/tvm/relay/backend/contrib/ethosu/errors.py index a4c659438d4d..65f3711838be --- a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh +++ b/python/tvm/relay/backend/contrib/ethosu/errors.py @@ -1,4 +1,3 @@ -#!/bin/bash -e # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,29 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=super-init-not-called +"""This module defines all error types associated with the Arm(R) Ethos(TM)-U NPU code generator.""" -set -e -# Get number of cores for build -if [ -n "${TVM_CI_NUM_CORES}" ]; then - num_cores=${TVM_CI_NUM_CORES} -else - # default setup for Vagrantfile - num_cores=2 -fi +class EthosUCodegenError(Exception): + """Base class for all exceptions related to code generation""" -cd "$(dirname $0)" -cd "$(git rev-parse --show-toplevel)" -BUILD_DIR=build-microtvm + def __init__(self, data): + self.message = "EthosUCodegenError:" + data -if [ ! -e "${BUILD_DIR}" ]; then - mkdir "${BUILD_DIR}" -fi -cp cmake/config.cmake "${BUILD_DIR}" -cd "${BUILD_DIR}" -sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake -sed -i 's/USE_PROFILER OFF/USE_PROFILER ON/' config.cmake -sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake -cmake .. -rm -rf standalone_crt host_standalone_crt # remove stale generated files -make -j${num_cores} + def __str__(self): + return self.message + + +class UnsupportedLayout(EthosUCodegenError): + """Raised when unsupported layout is encountered during code generation.""" + + def __init__(self, layout): + super().__init__(f"Unsupported Layout {layout}") diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py new file mode 100644 index 000000000000..82b7f1e68cee --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -0,0 +1,223 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel, no-value-for-parameter
+"""A set of passes to legalize some of the operations for the NPU"""
+from typing import List
+import numpy as np  # type: ignore
+
+import tvm  # type: ignore
+from tvm import relay
+from tvm import ir
+from tvm.relay.dataflow_pattern import DFPatternCallback  # type: ignore
+from tvm.relay.dataflow_pattern import wildcard
+from tvm.relay.dataflow_pattern import is_op
+from tvm.relay.dataflow_pattern import rewrite
+from tvm.relay.backend.contrib.ethosu import op as ethosu_ops  # type: ignore
+from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout  # type: ignore
+from tvm.relay.backend.contrib.ethosu import vela_api
+from tvm.relay.op.contrib import ethosu as ethosu_patterns  # type: ignore
+
+
+class SplitRewriter(DFPatternCallback):
+    """This rewriting converts split operations into a sequence of
+    strided_slice operations, because codegen is going to be based
+    on strided_slices that will define the slice of the tensor that
+    will be fed to the consumer.
+    """
+
+    def __init__(self):
+        super().__init__(require_type=True)
+        self.split_in = wildcard()
+        self.pattern = is_op("split")(self.split_in)
+
+    @staticmethod
+    def get_section_begin_coords(split: tvm.relay.Expr) -> List[int]:
+        """Currently, the split operator takes either an array of indices or
+        an integer indicating the number of splits. Since an array of indices
+        can represent both cases, this function normalizes the attribute into
+        an array of indices in which each index is the coordinate at which a
+        section begins -- referred to here as the section begins.
+
+        Parameters
+        ----------
+        split : tvm.relay.Expr
+            The Relay Call expression for a split operator
+
+        Returns
+        -------
+        section_begins : List[int]
+            A list containing integers corresponding to the section begins
+        """
+        indices_or_sections = split.attrs.indices_or_sections
+        input_shape = split.args[0].checked_type.shape
+        split_axis = split.attrs.axis
+
+        if isinstance(indices_or_sections, tvm.ir.container.Array):
+            # 0 is the beginning of the first section.
+            return [0] + list(indices_or_sections)
+        split_axis_len = input_shape[split_axis].value
+        section_length = split_axis_len // indices_or_sections.value
+        return list(range(0, split_axis_len, section_length))
+
+    def callback(
+        self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map
+    ) -> tvm.relay.Expr:
+        split_input = post.args[0]
+        split_begins = list()
+        split_ends = list()
+        section_begins_in_split_axis = self.get_section_begin_coords(post)
+        for split_cord in section_begins_in_split_axis:
+            # first begin is [0, 0, ... , 0]
+            begin_shape = [0 for i in range(len(split_input.checked_type.shape))]
+            begin_shape[post.attrs.axis] = split_cord
+            split_begins.append(begin_shape)
+
+            end_shape = list(split_input.checked_type.shape)
+            # Only the split axis coordinate changes
+            end_shape[post.attrs.axis] = split_cord
+            split_ends.append(end_shape)
+
+        # Coordinates need to be shifted left because the beginning
+        # of the next section is the end of the previous one
+        split_ends = split_ends[1:]
+        # The last section ends at the shape of the tensor itself.
+ split_ends.append(list(split_input.checked_type.shape)) + + strided_slices = list() + for sb, se in zip(split_begins, split_ends): + strided_slices.append(relay.strided_slice(split_input, sb, se)) + + return relay.Tuple(strided_slices) + + +@ir.transform.module_pass(opt_level=1) +class LegalizeSplit: + """This is the pass that wraps SplitRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(SplitRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +class EthosUConv2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into ethosu_conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": "ethosu.qnn_conv2d"}))(wildcard()) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = ethosu_patterns.QnnConv2DParams(post.op.body) + params.ifm.tensor = post.args[0] + channels_map = { + "NHWC": 3, + } + if str(params.ofm.layout) not in channels_map.keys(): + raise UnsupportedLayout(str(params.ofm.layout)) + kernel_size_map = { + "HWIO": params.weights.shape[0:2], + "OHWI": params.weights.shape[1:3], + "HWOI": params.weights.shape[0:2], + } + if str(params.weights.layout) not in kernel_size_map.keys(): + raise UnsupportedLayout(str(params.weights.layout)) + activation_map = {"clip": "CLIP"} + weight_to_ohwi_transform_map = {"HWIO": [3, 0, 1, 2]} + weights_values = params.weights.values + weights_values_ohwi = np.transpose( + weights_values, weight_to_ohwi_transform_map[str(params.weights.layout)] + ) + if params.activation: + activation = activation_map[params.activation.op.name] + clip_min = int(params.activation.attrs.a_min) + clip_max = int(params.activation.attrs.a_max) + else: + activation = "NONE" + clip_min = 0 + clip_max = 0 + scale_bias = vela_api.pack_biases( + biases=params.biases.tensor.data.asnumpy(), + ifm_scale=params.ifm.q_params.scale_f32, + ifm_dtype=np.dtype(params.ifm.dtype), + weight_scales=params.weights.q_params.scale_f32, + ofm_scale=params.ofm.q_params.scale_f32, + is_activation_tanh_or_sigmoid=activation in ["TANH", "SIGMOID"], + ) + ethosu_conv2d = ethosu_ops.ethosu_conv2d( + ifm=post.args[0], + weight=relay.const(weights_values_ohwi, params.weights.values.dtype), + scale_bias=relay.const(scale_bias, "uint8"), + lut=relay.const([], dtype="int8"), + ifm_scale=float(params.ifm.q_params.scale_f32), + ifm_zero_point=int(params.ifm.q_params.zero_point), + weight_zero_point=int(params.weights.q_params.zero_point), + ofm_scale=float(params.ofm.q_params.scale_f32), + ofm_zero_point=int(params.ofm.q_params.zero_point), + kernel_shape=kernel_size_map[str(params.weights.layout)], + ofm_channels=params.ofm.shape[channels_map[str(params.ofm.layout)]], + strides=params.strides, + padding=params.padding, + dilation=params.dilation, + activation=activation, + clip_min=clip_min, + clip_max=clip_max, + upscale="NONE", + ifm_layout=str(params.ifm.layout), + ofm_layout=str(params.ofm.layout), + ) + return ethosu_conv2d + + +@ir.transform.module_pass(opt_level=1) +class LegalizeEthosUConv2D: + """This is the pass that wraps the EthosUConv2DRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func 
= rewrite(EthosUConv2DRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeEthosU: + """This is the pass to call graph-rewrites to perform graph transformation + in a way such that the operations are replaced with hardware/codegen supported + operations. + """ + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + mod = LegalizeSplit()(mod) + mod = LegalizeEthosUConv2D()(mod) + return mod diff --git a/python/tvm/relay/backend/contrib/ethosu/op/__init__.py b/python/tvm/relay/backend/contrib/ethosu/op/__init__.py new file mode 100644 index 000000000000..0406298f23f4 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/op/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"Relay operators for the Arm(R) Ethos(TM)-U NPU" + +from .convolution import ethosu_conv2d diff --git a/python/tvm/relay/backend/contrib/ethosu/op/convolution.py b/python/tvm/relay/backend/contrib/ethosu/op/convolution.py new file mode 100644 index 000000000000..b159830ceaa9 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/op/convolution.py @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
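Two shape conventions recur in the operator definition that follows: weights are consumed in OHWI layout (the legalization above transposes HWIO weights with axes [3, 0, 1, 2]), and the per-channel scale and bias are packed as 10 bytes -- one uint80 -- per output channel. A small numpy illustration with placeholder values:

import numpy as np

hwio = np.zeros((3, 3, 8, 16), dtype=np.int8)  # (H, W, I, O)
ohwi = np.transpose(hwio, [3, 0, 1, 2])        # -> (O, H, W, I)
assert ohwi.shape == (16, 3, 3, 8)

scale_bias = np.zeros((16, 10), dtype=np.uint8)  # one packed uint80 per OFM channel
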
+# pylint: disable=unused-argument +"""Relay operators for convolutions for Arm(R) Ethos(TM)-U NPU""" +from typing import Tuple + +import tvm # type: ignore +from tvm.relay.op import _make # type: ignore +from tvm.topi.generic import schedule_injective # type: ignore +from tvm.relay.op.op import OpStrategy # type: ignore +from tvm.relay.op import strategy as _strategy + +from ..te import conv2d_compute + + +def _extract_ethosu_conv2d_params(attrs, args): + """Get the parameters necessary to construct a compute TE + from a ethosu_conv2d Relay call.""" + ifm = args[0] + weight = args[1] + scale_bias = args[2] + lut = args[3] + ifm_scale = attrs.ifm_scale + ifm_zero_point = attrs.ifm_zero_point + weight_zero_point = attrs.weight_zero_point + ofm_scale = attrs.ofm_scale + ofm_zero_point = attrs.ofm_zero_point + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + activation = attrs.activation + clip_min = attrs.clip_min + clip_max = attrs.clip_max + upscale = attrs.upscale + ifm_layout = attrs.ifm_layout + ofm_layout = attrs.ofm_layout + + return ( + ifm, + weight, + scale_bias, + lut, + ifm_scale, + ifm_zero_point, + weight_zero_point, + ofm_scale, + ofm_zero_point, + strides, + padding, + dilation, + activation, + clip_min, + clip_max, + upscale, + ifm_layout, + ofm_layout, + ) + + +@tvm.ir.register_op_attr("contrib.ethosu.conv2d", "FTVMCompute") +def create_ethosu_conv2d_compute(attrs, args, out_type): + """Create an ethosu_conv2d compute op.""" + params = _extract_ethosu_conv2d_params(attrs, args) + op = conv2d_compute(*params) + return [op] + + +@tvm.ir.register_op_attr("contrib.ethosu.conv2d", "FTVMStrategy") +def conv2d_strategy_ethosu(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + create_ethosu_conv2d_compute, + _strategy.wrap_topi_schedule(schedule_injective), + name="ethosu_conv2d", + ) + return strategy + + +def ethosu_conv2d( + ifm: tvm.relay.Expr, + weight: tvm.relay.Expr, + scale_bias: tvm.relay.Expr, + lut: tvm.relay.Expr, + ifm_scale: float, + ifm_zero_point: int, + weight_zero_point: int, + ofm_scale: float, + ofm_zero_point: int, + kernel_shape: Tuple[int, int], + ofm_channels: int, + strides: Tuple[int, int] = (1, 1), + padding: Tuple[int, int, int, int] = (0, 0, 0, 0), + dilation: Tuple[int, int] = (1, 1), + activation: str = "NONE", + clip_min: int = 0, + clip_max: int = 0, + upscale: str = "NONE", + ifm_layout: str = "NHWC", + ofm_layout: str = "NHWC", +) -> tvm.relay.Call: + """This is a quantized 2D convolution operation as supported by the + the NPU. It accepts either NHWC or NHCWB16 format + for the input data and OHWI format for the kernel weights. + + Reference: https://developer.arm.com/documentation/102420/0200/ + + Note that the per-channel weight scale and bias tensor must be + packed together into a combined tensor of uint80s. This is represented + in TVM by a (channels, 10) tensor of type uint8. For more detail, + refer to the Technical Reference Manual linked above. + + Parameters + ---------- + ifm : tvm.relay.Expr + The Input Feature Map tensor (IFM). + weight : tvm.relay.Expr + The weight tensor. + scale_bias : tvm.relay.Expr + The packed per-channel weight scale and bias tensor. + lut : tvm.relay.Expr + The look-up table values to use if activation = "LUT". + ifm_scale : float + The quantization scale for the Input Feature Map tensor. + ifm_zero_point : int + The quantization zero point for the Input Feature Map tensor. 
+    weight_zero_point : int
+        The quantization zero point for the weight tensor.
+    ofm_scale : float
+        The quantization scale for the Output Feature Map tensor.
+    ofm_zero_point : int
+        The quantization zero point for the Output Feature Map tensor.
+    kernel_shape : tuple of int
+        The 2 dimensional kernel shape as (kernel_height, kernel_width).
+    ofm_channels : int
+        The number of OFM channels.
+    strides : tuple of int, optional
+        The 2 dimensional strides as (stride_height, stride_width).
+    padding : tuple of int, optional
+        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
+    dilation : tuple of int, optional
+        The 2 dimensional dilation as (dilation_height, dilation_width).
+    activation : str, optional
+        The activation function to use.
+        "NONE" - no activation function.
+        "CLIP" - clip the output between clip_min and clip_max.
+        "TANH" - tanh activation function.
+        "SIGMOID" - sigmoid activation function.
+        "LUT" - use a look-up table to perform the activation function.
+    clip_min : int, optional
+        The minimum clipping value if activation = "CLIP".
+    clip_max : int, optional
+        The maximum clipping value if activation = "CLIP".
+    upscale : str, optional
+        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
+        "NONE" - no upscaling.
+        "NEAREST" - upscale using nearest neighbour.
+        "ZEROS" - upscale using zeros.
+    ifm_layout : str, optional
+        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
+    ofm_layout : str, optional
+        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".
+
+    Returns
+    -------
+    tvm.relay.Call
+        A call to the ethosu_conv2d op.
+
+    """
+    return _make.ethosu_conv2d(
+        ifm,
+        weight,
+        scale_bias,
+        lut,
+        ifm_scale,
+        ifm_zero_point,
+        weight_zero_point,
+        ofm_scale,
+        ofm_zero_point,
+        kernel_shape,
+        ofm_channels,
+        strides,
+        padding,
+        dilation,
+        activation,
+        clip_min,
+        clip_max,
+        upscale,
+        ifm_layout,
+        ofm_layout,
+    )
diff --git a/python/tvm/relay/backend/contrib/ethosu/preprocess.py b/python/tvm/relay/backend/contrib/ethosu/preprocess.py
new file mode 100644
index 000000000000..795adfc2fb1f
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/ethosu/preprocess.py
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+"""Set of passes to pre-process the IRModule to support Arm(R)-Ethos(TM)-U
+NPU code generation. This set of passes will mutate both the main and the
+external functions.
+"""
+import tvm  # type: ignore
+from . import _ffi_api  # type: ignore
+
+
+def preprocess_ext_io() -> tvm.transform.Pass:
+    """This pass mutates the number of inputs going to / outputs coming out to/from
+    external functions to one.
This is achieved via concatenation + of inputs and splitting of outputs in around the call to the external function. + + Returns + ------- + ret : tvm.transform.Pass + The registered pass to mutate the IO of the external functions and their calls. + """ + return _ffi_api.PreprocessExternalFuncIO() # type: ignore # pylint: disable=no-member diff --git a/python/tvm/relay/backend/contrib/ethosu/te/__init__.py b/python/tvm/relay/backend/contrib/ethosu/te/__init__.py new file mode 100644 index 000000000000..7ca5de3c160c --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/te/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tensor Expressions for the NPU""" + +from .convolution import * diff --git a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py new file mode 100644 index 000000000000..26f7ea979219 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-argument +"""Tensor Expressions for convolutions for the NPU""" +from typing import Tuple, Union, List + +from tvm import te # type: ignore +from .dma import dma_ofm_compute, dma_ifm_compute + + +def conv2d_compute( + ifm: te.Tensor, + weight: te.Tensor, + scale_bias: te.Tensor, + lut: te.Tensor, + ifm_scale: float, + ifm_zero_point: int, + weight_zero_point: int, + ofm_scale: float, + ofm_zero_point: int, + strides: Tuple[int, int], + padding: Tuple[int, int, int, int], + dilation: Union[Tuple[int, int], List[int]], + activation: str, + clip_min: int, + clip_max: int, + upscale: str, + ifm_layout: str, + ofm_layout: str, +) -> te.Tensor: + """A compute operator representing the capabilities of a 2D convolution for the NPU. + + Parameters + ---------- + ifm : te.Tensor + The Input Feature Map tensor (IFM). + weight : te.Tensor + The weight tensor. + scale_bias : te.Tensor + The packed per-channel weight scale and bias tensor. 
+ lut : te.Tensor + The look-up table values to use if activation = "LUT". + ifm_scale : float + The quantization scale for the Input Feature Map tensor. + ifm_zero_point : int + The quantization zero point for the Input Feature Map tensor. + weight_zero_point : int + The quantization zero point for the weight tensor. + ofm_scale : float + The quantization scale for the Output Feature Map tensor. + ofm_zero_point : int + The quantization zero point for the Output Feature Map tensor. + strides : tuple + The 2 dimensional strides as (stride_height, stride_width). + padding : tuple + The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + dilation : Union[Tuple[int, int], List[int]] + The 2 dimensional dilation as (dilation_height, dilation_width). + activation : str + The activation function to use. + "NONE" - no activation function. + "CLIP" - clip the output between clip_min and clip_max. + "TANH" - tanh activation function. + "SIGMOID" - sigmoid activation function. + "LUT" - use a look-up table to perform the activation function. + clip_min : int + The minimum clipping value if activation = "CLIP". + clip_max : int + The maximum clipping value if activation = "CLIP". + upscale : str + The 2x2 upscaling mode to apply to the Input Feature Map tensor. + "NONE" - no upscaling. + "NEAREST" - upscale using nearest neighbour. + "ZEROS" - upscale using zeros. + ifm_layout : str + The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16". + ofm_layout : str + The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16". + + Returns + ------- + te.Tensor + The OFM tensor. + + """ + assert ifm.shape[0] == 1 + assert ifm_layout in {"NHWC", "NHCWB16"} + assert ofm_layout in {"NHWC", "NHCWB16"} + + stride_h, stride_w = strides + dilation_h, dilation_w = dilation + ofm_channels, kernel_h, kernel_w, ifm_channels = weight.shape + + # Compute operation for the IFM DMA pipeline + dmaed_ifm = dma_ifm_compute( + ifm, ifm_layout, ifm_zero_point, ifm_scale, weight.shape[3], padding + ) + + # 2D Convolution compute operation + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + ofm_height = (dmaed_ifm.shape[1] - dilated_kernel_h) // stride_h + 1 + ofm_width = (dmaed_ifm.shape[2] - dilated_kernel_w) // stride_w + 1 + rc = te.reduce_axis((0, ifm_channels), name="rc") + rh = te.reduce_axis((0, kernel_h), name="ry") + rw = te.reduce_axis((0, kernel_w), name="rx") + + conv2d_attrs = { + "op": "ethosu_conv2d", + "weight_zero_point": weight_zero_point, + "activation": activation, + "upscale": upscale, + "clip_min": clip_min, + "clip_max": clip_max, + "stride_h": stride_h, + "stride_w": stride_w, + "dilation_h": dilation_h, + "dilation_w": dilation_w, + } + + conv = te.compute( + (1, ofm_height, ofm_width, ofm_channels), + lambda nn, hh, ww, cc: te.sum( + dmaed_ifm( + nn, hh * stride_h + rh * dilation_h, ww * stride_w + rw * dilation_w, rc + ).astype(ifm.dtype) + * weight[cc, rh, rw, rc].astype(ifm.dtype) + # This is a trick to load 10 elements of the scale_bias at once, not accurate maths + + (scale_bias[cc, 0] * scale_bias[cc, 9]).astype(ifm.dtype), + axis=[rh, rw, rc], + ), + name="ethosu_conv2d", + attrs=conv2d_attrs, + ) + + # Compute operation for the OFM DMA pipeline + return dma_ofm_compute(conv, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels) diff --git a/python/tvm/relay/backend/contrib/ethosu/te/dma.py b/python/tvm/relay/backend/contrib/ethosu/te/dma.py new file mode 100644 index 000000000000..bf9a018ea855 
--- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/te/dma.py @@ -0,0 +1,311 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unnecessary-lambda +"""Tensor Expressions for operations supported by the NPU DMA engine""" +from typing import Callable, Tuple, Optional, List + +import tvm # type: ignore +from tvm import te +from tvm.topi.utils import equal_const_int # type: ignore + + +def _pad_tensor( + tensor: te.Tensor, pad_before: List[int], pad_after: Optional[List[int]] = None +) -> Callable: + """Generate a padded tensor. + + Parameters + ---------- + tensor : te.Tensor + The tensor to pad. + pad_before : tuple of int + The 'before' padding on each axis. + pad_after : tuple of int + The 'after' padding on each axis. + Returns + ------- + _pad : callable + The padded tensor. + + """ + pad_after = pad_after or pad_before + dims = len(tensor.shape) + assert len(pad_before) == dims + assert len(pad_after) == dims + + def _pad(*indices): + not_zero = [] # A list of padding conditions that aren't trivial (zero padding) + index_tuple = [] # The indices with which to access the padded tensor + for i in range(dims): + if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0): + index_tuple.append(indices[i]) + else: + index_tuple.append(indices[i] - pad_before[i]) + not_zero.append(indices[i] >= pad_before[i]) + not_zero.append(indices[i] < tensor.shape[i] + pad_before[i]) + if not_zero: + not_zero = tvm.tir.all(*not_zero) + return tvm.tir.if_then_else( + not_zero, tensor(*index_tuple), tvm.tir.const(0, tensor.dtype) + ) + return tensor(*index_tuple) + + return _pad + + +def read_compute(tensor: te.Tensor, layout: str, zero_point: int, scale: float) -> te.Tensor: + """A tensor expression which represents a read. + + Parameters + ---------- + tensor : te.Tensor + The tensor to read. + layout : str + The layout of the tensor, either NHWC or NHCWB16. + zero_point : int + The zero point of the tensor. + scale : float + The scale of the tensor. + + Returns + ------- + te.Tensor + The tensor having been read. + + """ + assert layout in {"NHWC", "NHCWB16"} + read_attrs = { + "op": "ethosu_read", + "layout": layout, + "zero_point": zero_point, + "scale": scale, + } + return te.compute(tensor.shape, lambda *i: tensor(*i), name="ethosu_read", attrs=read_attrs) + + +def write_compute(tensor: te.Tensor, layout: str, zero_point: int, scale: float) -> te.Tensor: + """A tensor expression which represents a write. + + Parameters + ---------- + tensor : te.Tensor + The tensor to write. + layout : str + The layout of the tensor, either NHWC or NHCWB16. + zero_point : int + The zero point of the tensor. + scale : float + The scale of the tensor. + + Returns + ------- + te.Tensor + The tensor having been written. 
+ + """ + assert layout in {"NHWC", "NHCWB16"} + write_attrs = { + "op": "ethosu_write", + "layout": layout, + "zero_point": zero_point, + "scale": scale, + } + return te.compute( + tensor.shape, + lambda *i: tensor(*i), + name="ethosu_write", + attrs=write_attrs, + ) + + +def convert_to_nhwc_compute(tensor: te.Tensor, layout: str, channels: int) -> te.Tensor: + """Converts a tensor into NHWC layout if it's in NHWCB16 layout. + + Parameters + ---------- + tensor : te.Tensor + The tensor to convert. + layout : str + The layout of the tensor, either NHWC or NHCWB16. + channels : int + The number of valid channels for the tensor. + + Returns + ------- + te.Tensor + The converted tensor in NHWC layout. + + """ + assert layout in {"NHWC", "NHCWB16"} + convert_to_nhwc_attrs = { + "op": "ethosu_convert_to_nhwc", + "layout": layout, + } + if layout == "NHCWB16": + return te.compute( + (tensor.shape[0], tensor.shape[1], tensor.shape[3], channels), + lambda nn, hh, ww, cc: tensor(nn, hh, te.indexdiv(cc, 16), ww, te.indexmod(cc, 16)), + name="ethosu_convert_to_nhwc", + attrs=convert_to_nhwc_attrs, + ) + + return te.compute( + tensor.shape, + lambda *i: tensor(*i), + name="ethosu_convert_to_nhwc", + attrs=convert_to_nhwc_attrs, + ) + + +def convert_to_nhcwb16_compute(tensor: te.Tensor, layout: str, channels: int) -> te.Tensor: + """Converts a tensor into NHCWB16 layout if it's in NHWC layout. + + Parameters + ---------- + tensor : te.Tensor + The tensor to convert. + layout : str + The layout of the tensor, either NHWC or NHCWB16. + channels : int + The number of valid channels for the tensor. + + Returns + ------- + te.Tensor + The converted tensor in NHCWB16 layout. + + """ + assert layout in {"NHWC", "NHCWB16"} + convert_to_nhcwb16_attrs = { + "op": "ethosu_convert_to_nhcwb16", + "layout": layout, + } + if layout == "NHCWB16": + out_channel_bricks = te.indexdiv(channels - 1, 16) + 1 + output_shape = (1, tensor.shape[1], out_channel_bricks, tensor.shape[2], 16) + return te.compute( + output_shape, + lambda nn, hh, cc, ww, cb: tvm.tir.if_then_else( + cc * 16 + cb < channels, + tensor(nn, hh, ww, cc * 16 + cb), + tvm.tir.IntImm(tensor.dtype, 0), + ), + name="ethosu_convert_to_nhcwb16", + attrs=convert_to_nhcwb16_attrs, + ) + + return te.compute( + tensor.shape, + lambda *i: tensor(*i), + name="ethosu_convert_to_nhcwb16", + attrs=convert_to_nhcwb16_attrs, + ) + + +def pad_compute(tensor: te.Tensor, padding: tuple) -> te.Tensor: + """Pad an NHWC tensor in the height and width axes. + + Parameters + ---------- + tensor : te.Tensor + The tensor to pad. + padding : tuple + The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + + Returns + ------- + te.Tensor + The padded tensor. + + """ + pad_top, pad_left, pad_down, pad_right = padding + pad_before = [0, int(pad_top), int(pad_left), 0] + pad_after = [0, int(pad_down), int(pad_right), 0] + pad_attrs = { + "op": "ethosu_pad", + } + shape = tensor.shape + return te.compute( + (shape[0], shape[1] + pad_top + pad_down, shape[2] + pad_left + pad_right, shape[3]), + lambda nn, hh, ww, cc: _pad_tensor(tensor, pad_before, pad_after)(nn, hh, ww, cc), + name="ethosu_pad", + attrs=pad_attrs, + ) + + +def dma_ifm_compute( + ifm: te.Tensor, + layout: str, + zero_point: int, + scale: float, + channels: int, + padding: Tuple[int, int, int, int], +) -> te.Tensor: + """A sequence of compute operators representing the DMA capabilities for an IFM. + + Parameters + ---------- + ifm : te.Tensor + The Input Feature Map (IFM) tensor. 
+ layout : str + The layout of the data, either NHWC or NHCWB16. + zero_point : int + The zero point of the data. + scale : float + The scale of the data. + channels : int + The number of valid channels for the data. + padding : tuple + The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + + Returns + ------- + te.Tensor + The dma-ed IFM tensor. + + """ + read_ifm = read_compute(ifm, layout, zero_point, scale) + convert_to_nhwc_ifm = convert_to_nhwc_compute(read_ifm, layout, channels) + return pad_compute(convert_to_nhwc_ifm, padding) + + +def dma_ofm_compute( + ofm: te.Tensor, layout: str, zero_point: int, scale: float, channels: int +) -> te.Tensor: + """A sequence of compute operators representing the DMA capabilities for an OFM. + + Parameters + ---------- + ofm : te.Tensor + The Output Feature Map (OFM) tensor. + layout : str + The layout of the data, either NHWC or NHCWB16. + zero_point : int + The zero point of the data. + scale : float + The scale of the data. + channels : int + The number of valid channels for the data. + + Returns + ------- + te.Tensor + The dma-ed OFM tensor. + + """ + convert_to_nhcwb16_ofm = convert_to_nhcwb16_compute(ofm, layout, channels) + return write_compute(convert_to_nhcwb16_ofm, layout, zero_point, scale) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/__init__.py b/python/tvm/relay/backend/contrib/ethosu/tir/__init__.py new file mode 100644 index 000000000000..cc285e5241cd --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm(R) Ethos(TM)-U NPU TIR codegen modules.""" diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py new file mode 100644 index 000000000000..c59a386fefbb --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
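For reference, the IFM DMA chain defined above (read -> convert_to_nhwc -> pad) can be exercised directly from TE. The following is an illustrative sketch only: it assumes the module path introduced in this patch, and the shape, quantization parameters and padding are invented.

```python
from tvm import te
from tvm.relay.backend.contrib.ethosu.te.dma import dma_ifm_compute

# Hypothetical 1x12x12x8 int8 feature map in NHWC layout
ifm = te.placeholder((1, 12, 12, 8), dtype="int8", name="ifm")
# zero_point=0, scale=1.0, 8 valid channels, 1 pixel of padding on each side
dmaed = dma_ifm_compute(ifm, "NHWC", 0, 1.0, 8, (1, 1, 1, 1))
# The result is the pad compute op; height/width grow by top+bottom/left+right
print(dmaed.shape)  # expected: (1, 14, 14, 8)
```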
+# pylint: disable=invalid-name, unused-argument
+"""The integration of Arm(R) Ethos(TM)-U NPU TIR compiler"""
+import tvm
+from tvm import relay
+from tvm.relay.expr_functor import ExprMutator
+from tvm.driver.build_module import get_binds
+
+from .passes import ReplaceOperators, RemoveZeroStores, EncodeConstants
+from .scheduler import schedule
+
+
+def lower_ethosu(sch, args, const_dict, name="main"):
+    """Lower a schedule to TIR for the Arm(R) Ethos(TM)-U NPU target.
+
+    The resulting TIR module will contain a single function
+    that comprises a sequence of tir.extern_calls to NPU
+    operations.
+
+    Parameters
+    ----------
+    sch : tvm.te.Schedule
+        The schedule to be lowered.
+    args : Union[list of tvm.te.Tensor, TEGraph]
+        The input/output tensors.
+    const_dict : dict of int to numpy.ndarray
+        The constant dictionary.
+    name : str, optional
+        The name of the lowered primitive function.
+
+    Returns
+    -------
+    mod : tvm.IRModule
+        The lowered TIR module.
+    const_dict : dict of int to numpy.ndarray
+        The modified constant dictionary.
+
+    """
+    if not isinstance(args, list):
+        args = list(args.inputs) + list(args.outputs)
+    # config setup
+    curr_pass_ctx = tvm.ir.transform.PassContext.current()
+    curr_cfg = dict()
+    for key, value in curr_pass_ctx.config.items():
+        curr_cfg[key] = value
+    tir_compiler_cfg = {
+        "tir.LoopPartition": {
+            "partition_const_loop": True,
+            "no_unroll_loop_with_extent_one": True,
+        },
+        "tir.UnrollLoop": {"auto_max_depth": -1},
+    }
+    # Merge two configs
+    curr_cfg = {**curr_cfg, **tir_compiler_cfg}
+
+    sch = sch.normalize()
+    bounds = tvm.te.schedule.InferBound(sch)
+    stmt = tvm.te.schedule.ScheduleOps(sch, bounds, True)
+
+    compact = tvm.te.schedule.VerifyCompactBuffer(stmt)
+    binds, arg_list = get_binds(args, compact, None)
+    func = tvm.te.schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds)
+
+    func = func.with_attr("global_symbol", name)
+    func = func.with_attr("tir.noalias", True)
+    mod = tvm.IRModule({name: func})
+    with tvm.transform.PassContext(config=curr_cfg):
+        mod = tvm.tir.transform.Simplify()(mod)
+        mod = tvm.tir.transform.StorageFlatten(64)(mod)
+        mod = tvm.tir.transform.UnrollLoop()(mod)
+        mod = tvm.tir.transform.LoopPartition()(mod)
+        mod = RemoveZeroStores()(mod)
+        mod = tvm.tir.transform.Simplify()(mod)
+        mod = tvm.tir.transform.RemoveNoOp()(mod)
+        mod = ReplaceOperators()(mod)
+        mod = tvm.tir.transform.RemoveNoOp()(mod)
+        mod, const_dict = EncodeConstants(const_dict)(mod)
+        mod = tvm.tir.transform.StorageRewrite()(mod)
+        mod = tvm.tir.transform.RemoveNoOp()(mod)
+    return mod, const_dict
+
+
+def lower_to_te(prim_func):
+    """Lower a Relay primitive function to a Tensor Expression graph.
+
+    Parameters
+    ----------
+    prim_func : tvm.relay.Function
+        The Relay function to lower.
+
+    Returns
+    -------
+    out : TEGraph
+        The lowered Tensor Expression graph.
+
+    """
+    f = tvm._ffi.get_global_func("relay.backend.contrib.ethosu.LowerToTE")
+    return f(prim_func)
+
+
+class ExtractConstants(ExprMutator):
+    """The actual mutator pass to extract the constants from a function and replace them with
+    Vars so the function can be lowered to a TE graph.
Additionally returns all the values of + the constants extracted.""" + + def __init__(self): + super().__init__() + self.constants = [] + + def visit_constant(self, const): + if isinstance(const.checked_type, relay.ty.TensorType): + if const.checked_type.concrete_shape != (): + self.constants.append(const.data.asnumpy()) + name = "p" + str(len(self.constants)) + return relay.var(type_annotation=const.checked_type, name_hint=name) + + return const + + def visit_function(self, fn): + new_body = self.visit(fn.body) + new_params = list(relay.analysis.free_vars(new_body)) + return relay.Function(new_params, new_body) + + def extract_constants(self, func): + new_func = self.visit(func) + return new_func, self.constants + + +def extract_constants(func): + """Extract the constants from a function and replace them with + Vars so the function can be lowered to a TE graph. Additionally + returns all the values of the constants extracted. + + Parameters + ---------- + func : tvm.relay.Function + The Relay function from which to extract constants. + + Returns + ------- + new_func : tvm.relay.Function + The Relay function with constants replaced by vars. + const_dict : dict of int to numpy.ndarray + A dict of the extracted constants keyed by their param index. + + """ + const_dict = {} + params = len(func.params) + new_func, consts = ExtractConstants().extract_constants(func) + for i, const in enumerate(consts): + const_dict[params + i] = const + + new_func = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(new_func))["main"] + return new_func, const_dict + + +def lower_to_tir(func, cascader=None): + """Lower a Relay function to TIR for the Arm(R) Ethos(TM)-U NPU target. + + The Relay function should only contain operations supported + by the NPU. + + Parameters + ---------- + func : tvm.relay.Function + The Relay function to lower. + cascader : Callable + An optional cascading function, + + Returns + ------- + mod : tvm.IRModule + The lowered TIR module. + consts : dict of int to numpy.ndarray + A dict of the extracted constants keyed by their param index. + + """ + func, consts = extract_constants(func) + mod = tvm.IRModule.from_expr(func) + func = relay.transform.InferType()(mod)["main"] + te_graph = lower_to_te(func) + s = schedule(te_graph, consts, cascader) + mod, consts = lower_ethosu(s, te_graph, consts) + return mod, consts diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py b/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py new file mode 100644 index 000000000000..33fbdcd2b24f --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
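To make the constant-extraction step above concrete, here is a hedged sketch of `extract_constants`; the function and shapes are invented for illustration and it assumes the module path introduced in this diff.

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants

x = relay.var("x", shape=(1, 4), dtype="int8")
w = relay.const(np.ones((1, 4), dtype="int8"))
func = relay.Function([x], relay.add(x, w))
# The mutator reads checked_type, so the function must be type-inferred first
func = relay.transform.InferType()(tvm.IRModule.from_expr(func))["main"]

new_func, const_dict = extract_constants(func)
# The constant is now a fresh parameter appended after the original one,
# so the dict keys it by param index 1
print(list(const_dict.keys()))  # expected: [1]
```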
+# pylint: disable=invalid-name, unused-argument +"""Extract information from the convolution operators in TIR.""" +import tvm +from ..vela_api import SCALE_BIAS_LENGTH +from .utils import get_outer_loops, get_op_attrs, get_base_address, get_loads, get_stores +from .dma import get_ifm_params, get_ofm_params +from .spec import SerialKernel, SerialAddressRange, SerialActivation, Serial2DConvolution + + +def get_conv2d_params(stmt, producers, consumers): + """Get the parameters necessary to construct a call_extern for a 2D convolution. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a convolution loop nest. + producers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that produces their values. + consumers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that consumes their values. + + Returns + ------- + Serial2DConvolution + The parameters needed to construct a 2D convolution. + output_pointer : tvm.tir.Var + The output pointer of the convolution operation. + replace_pointer : tvm.tir.Var + The output pointer of the DMA write operation, which is to replace + the convolution output pointer. + + """ + attrs, body = get_op_attrs(stmt) + _, _, _, _, _, inner = get_outer_loops(body, "NHWC") + rh = inner + rw = rh.body + rc = rw.body + # loads = [output, input, weights, scale_bias, scale_bias] + loads = get_loads(rc.body) + # stores = [output] + stores = get_stores(rc.body) + input_pointer = loads[1].buffer_var + output_pointer = stores[0].buffer_var + # Get feature map info + serial_ifm, serial_padding = get_ifm_params(input_pointer, producers) + serial_ofm, replace_pointer = get_ofm_params(output_pointer, consumers) + # Get kernel info + serial_kernel = SerialKernel( + width=int(rw.extent), + height=int(rh.extent), + stride_w=int(attrs["stride_w"]), + stride_h=int(attrs["stride_h"]), + dilation_w=int(attrs["dilation_w"]), + dilation_h=int(attrs["dilation_h"]), + ) + # Get scale_bias info + scale_bias_load = loads[3] + scale_bias_base = get_base_address(scale_bias_load.index) + serial_scale_bias = SerialAddressRange( + address=tvm.tir.Load("uint8", scale_bias_load.buffer_var, scale_bias_base), + length=SCALE_BIAS_LENGTH * serial_ofm[3], + ) + # Get weight info + weight_load = loads[2] + weight_base = get_base_address(weight_load.index) + serial_weight = SerialAddressRange( + address=tvm.tir.Load("uint8", weight_load.buffer_var, weight_base), + length=serial_ofm[3] * serial_kernel[0] * serial_kernel[1] * rc.extent, + ) + # Get activation info + serial_activation = SerialActivation( + op=attrs["activation"], clip_min=attrs["clip_min"], clip_max=attrs["clip_max"] + ) + return ( + Serial2DConvolution( + ifm=serial_ifm, + ofm=serial_ofm, + kernel=serial_kernel, + weight=serial_weight, + weight_zero_point=attrs["weight_zero_point"], + scale_bias=serial_scale_bias, + padding=serial_padding, + activation=serial_activation, + upscale="NONE", + ), + output_pointer, + replace_pointer, + ) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/dma.py b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py new file mode 100644 index 000000000000..ecd402d63309 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py @@ -0,0 +1,291 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Extract information from the DMA operators in TIR.""" +import tvm +from .utils import get_outer_loops, get_base_address, get_strides, get_op_attrs +from .spec import SerialFeatureMap, SerialPadding + + +def get_pad_params(stmt): + """Get the padding parameters from a pad loop nest. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a pad loop nest. + + Returns + ------- + pad : SerialPadding + The serializable padding. + input_pointer : tvm.tir.Var + The pointer consumed by the operation. + output_pointer : tvm.tir.Var + The pointer produced by the operation. + + """ + _, body = get_op_attrs(stmt) + n, h, w, c, _, inner = get_outer_loops(body, "NHWC") + output_pointer = inner.buffer_var + pad = SerialPadding(top=0, left=0, bottom=0, right=0) + if isinstance(inner.value, tvm.tir.Call): + input_pointer = inner.value.args[1].buffer_var + else: + input_pointer = inner.value.buffer_var + return pad, input_pointer, output_pointer + + padded_shape = [n.extent, h.extent, w.extent, c.extent] + + def _visit(expr): + if isinstance(expr, tvm.tir.expr.LT): + var = expr.a + val = expr.b + if var == h.loop_var: + pad.bottom = padded_shape[1] - val + else: + pad.right = padded_shape[2] - val + elif isinstance(expr, tvm.tir.expr.LE): + var = expr.b + val = expr.a + if var == h.loop_var: + pad.top = val + else: + pad.left = val + + cond = inner.value.args[0] + tvm.tir.stmt_functor.post_order_visit(cond, _visit) + return ( + pad, + input_pointer, + output_pointer, + ) + + +def get_convert_to_nhwc_params(stmt): + """Get the true number of channels from a convert_to_nhwc loop nest. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a convert_to_nhwc loop nest. + + Returns + ------- + int + The true number of channels. + input_pointer : tvm.tir.Var + The pointer consumed by the operation. + output_pointer : tvm.tir.Var + The pointer produced by the operation. + + """ + _, body = get_op_attrs(stmt) + _, _, _, c, _, inner = get_outer_loops(body, "NHWC") + output_pointer = inner.buffer_var + input_pointer = inner.value.buffer_var + return c.extent, input_pointer, output_pointer + + +def get_convert_to_nhcwb16_params(stmt): + """Get the true number of channels from a convert_to_nhcwb16 loop nest. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a convert_to_nhcwb16 loop nest. + + Returns + ------- + out_channels : int + The true number of channels. + input_pointer : tvm.tir.Var + The pointer consumed by the operation. + output_pointer : tvm.tir.Var + The pointer produced by the operation. 
+ + """ + attrs, body = get_op_attrs(stmt) + _, _, _, c, b, inner = get_outer_loops(body, attrs["layout"]) + output_pointer = inner.buffer_var + if isinstance(inner.value, tvm.tir.Call): + cond = inner.value.args[0] + out_channels = cond.b.value + input_pointer = inner.value.args[1].buffer_var + else: + input_pointer = inner.value.buffer_var + out_channels = c.extent * b.extent if attrs["layout"] == "NHCWB16" else c.extent + + return out_channels, input_pointer, output_pointer + + +def get_read_params(stmt): + """Get the feature map parameters from a read loop nest. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a read loop nest. + + Returns + ------- + SerialFeatureMap + The serializable feature map. + input_pointer : tvm.tir.Var + The pointer consumed by the operation. + output_pointer : tvm.tir.Var + The pointer produced by the operation. + + """ + attrs, body = get_op_attrs(stmt) + _, h, w, c, _, inner = get_outer_loops(body, attrs["layout"]) + input_pointer = inner.value.buffer_var + output_pointer = inner.buffer_var + stride_vars = [h.loop_var, w.loop_var, c.loop_var] + strides = get_strides(inner.value.index, stride_vars) + base_address = get_base_address(inner.value.index) + data_type = inner.buffer_var.type_annotation.element_type.dtype + return ( + SerialFeatureMap( + data_type=data_type, + height=h.extent, + width=w.extent, + channels=c.extent, + tile_height_0=h.extent, + tile_height_1=0, + tile_width_0=w.extent, + tile_address_0=tvm.tir.Load(data_type, inner.value.buffer_var, base_address), + tile_address_1=0, + tile_address_2=0, + tile_address_3=0, + scale=attrs["scale"], + zero_point=attrs["zero_point"], + layout=attrs["layout"], + stride_h=strides[0], + stride_w=strides[1], + stride_c=strides[2], + ), + input_pointer, + output_pointer, + ) + + +def get_write_params(stmt): + """Get the feature map parameters from a write loop nest. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a write loop nest. + + Returns + ------- + SerialFeatureMap + The serializable feature map. + input_pointer : tvm.tir.Var + The pointer consumed by the operation. + output_pointer : tvm.tir.Var + The pointer produced by the operation. + + """ + attrs, body = get_op_attrs(stmt) + _, h, w, c, _, inner = get_outer_loops(body, attrs["layout"]) + input_pointer = inner.value.buffer_var + output_pointer = inner.buffer_var + stride_vars = [h.loop_var, w.loop_var, c.loop_var] + strides = get_strides(inner.index, stride_vars) + base_address = get_base_address(inner.index) + data_type = inner.buffer_var.type_annotation.element_type.dtype + return ( + SerialFeatureMap( + data_type=data_type, + height=h.extent, + width=w.extent, + channels=c.extent, + tile_height_0=h.extent, + tile_height_1=0, + tile_width_0=w.extent, + tile_address_0=tvm.tir.Load(data_type, inner.buffer_var, base_address), + tile_address_1=0, + tile_address_2=0, + tile_address_3=0, + scale=attrs["scale"], + zero_point=attrs["zero_point"], + layout=attrs["layout"], + stride_h=strides[0], + stride_w=strides[1], + stride_c=strides[2], + ), + input_pointer, + output_pointer, + ) + + +def get_ifm_params(pointer, producers): + """Get the parameters associated with the DMA capabilities for an IFM. + + Parameters + ---------- + pointer : tvm.tir.Var + The pointer that the IFM DMA pipeline produces. + producers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that produces their values. 
+ + Returns + ------- + serial_ifm : SerialFeatureMap + The serializable IFM. + serial_padding : SerialPadding + The serializable padding. + + """ + pad = producers[pointer] + serial_padding, input_pointer, _ = get_pad_params(pad) + convert_to_nhwc = producers[input_pointer] + in_channels, input_pointer, _ = get_convert_to_nhwc_params(convert_to_nhwc) + read = producers[input_pointer] + serial_ifm, _, _ = get_read_params(read) + serial_ifm.channels = in_channels + return serial_ifm, serial_padding + + +def get_ofm_params(pointer, consumers): + """Get the parameters associated with the DMA capabilities for an OFM. + + Parameters + ---------- + pointer : tvm.tir.Var + The pointer that the OFM DMA pipeline consumes. + consumers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that consumes their values. + + Returns + ------- + serial_ifm : SerialFeatureMap + The serializable OFM. + output_pointer : tvm.tir.Var + The pointer that the OFM DMA pipeline produces. + + """ + convert_to_nhcwb16 = consumers[pointer] + out_channels, _, output_pointer = get_convert_to_nhcwb16_params(convert_to_nhcwb16) + write = consumers[output_pointer] + serial_ofm, _, output_pointer = get_write_params(write) + serial_ofm.channels = out_channels + return serial_ofm, output_pointer diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py new file mode 100644 index 000000000000..1af44962c141 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -0,0 +1,475 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler""" +import numpy as np # type: ignore + +import tvm +from tvm.relay.backend.contrib.ethosu import vela_api +from .convolution import get_conv2d_params +from .transform import get_copy_params +from .utils import get_weights_pointer, get_scale_bias_pointer + + +def RemoveZeroStores(): + """This pass removes stores which just store zero to initialise buffers. 
+
+    We don't codegen these stores and they would otherwise considerably
+    complicate the static traversal of the convolution loop nest."""
+
+    def _remove_zero_store(stmt):
+        if isinstance(stmt.value, tvm.tir.IntImm) and int(stmt.value) == 0:
+            return tvm.tir.Evaluate(tvm.tir.IntImm("uint8", 0))
+        return stmt
+
+    def _ftransform(f, mod, ctx):
+        return f.with_body(
+            tvm.tir.stmt_functor.ir_transform(f.body, _remove_zero_store, None, ["tir.Store"])
+        )
+
+    return tvm.tir.transform.prim_func_pass(
+        _ftransform, opt_level=0, name="tir.ethosu.remove_zero_stores"
+    )
+
+
+def ReplaceOperators():
+    """Replace operators represented as explicit loop nests with call_externs
+    to NPU operators."""
+    op_map = {
+        "ethosu_conv2d": get_conv2d_params,
+        "ethosu_copy": get_copy_params,
+    }
+    pointer_to_producer = {}
+    pointer_to_consumer = {}
+    replace_output_pointer = {}
+    pointer_to_extents = {}
+
+    def _resolve_pointers(stmt):
+        """This pass determines information about the pointers present in the IR.
+        In particular, it associates pointers with both the operations that
+        produce them and the operations that consume them through the
+        pointer_to_producer and pointer_to_consumer dicts.
+
+        Additionally, it determines the extent (size/shape) of each pointer which
+        is required for the _replace_pointers pass which runs later."""
+        loads = []
+
+        def _get_loads(stmt):
+            if isinstance(stmt, tvm.tir.Load):
+                loads.append(stmt.buffer_var)
+
+        if isinstance(stmt, tvm.tir.Allocate):
+            pointer_to_extents[stmt.buffer_var] = stmt.extents
+            if isinstance(stmt.body[0], tvm.tir.AttrStmt):
+                if stmt.body[0].attr_key == "pragma_op":
+                    pointer_to_producer[stmt.buffer_var] = stmt.body[0]
+
+        elif isinstance(stmt, tvm.tir.AttrStmt):
+            if stmt.attr_key == "pragma_op":
+                tvm.tir.stmt_functor.post_order_visit(stmt, _get_loads)
+                for load_buffer in loads:
+                    pointer_to_consumer[load_buffer] = stmt
+
+    def _replace_operator(stmt):
+        """Replace operators with call_externs, having derived the parameters
+        from the relevant TIR expressions/statements.
+
+        Note the complexity of this pass is mostly from the concept of 'replace
+        pointers'. A call_extern may in principle require information from several
+        loop nests in TIR (each corresponding to a different TE compute op). For
+        example, a convolution operator will have other TE compute ops before and
+        after corresponding to the input/output DMA functionality. Therefore, when
+        the 'central' convolution op is replaced with a call_extern, the memory
+        from the final DMA output op must be hoisted to the location/scope of
+        the call_extern.
+
+        This is done by replacing the pointer corresponding to the current operation
+        with the 'true' output pointer through the replace_output_pointer dict.
+        Because of this, the param_func must provide a replace_pointer whenever the
+        final output is produced by a no_compile op rather than by the op being
+        replaced."""
+        if isinstance(stmt, tvm.tir.AttrStmt):
+            op_name = stmt.value.value
+            if stmt.attr_key == "pragma_op" and op_name in op_map:
+                # Get the parameters for the extern call
+                param_func = op_map[op_name]
+                info, output_pointer, replace_pointer = param_func(
+                    stmt, pointer_to_producer, pointer_to_consumer
+                )
+                if replace_pointer is not None:
+                    replace_output_pointer[output_pointer] = replace_pointer
+                # Make the extern call
+                irb = tvm.tir.ir_builder.create()
+                irb.emit(tvm.tir.call_extern("handle", op_name, *info))
+                return irb.get()
+        return None
+
+    def _remove_no_compile(stmt):
+        """Certain operators are marked as 'no compile' operators.
This means they + should be removed from the IR as they are compiled as part of other operators. + The IFM DMA operations are an example of this, as they don't get compiled + independently but instead get compiled into the operator they're associated with, + e.g. a conv2d. + + There are potentially 3 parts to remove for an operator: the memory scope, the + allocate for its output and the compute nest itself. For the memory scope and + allocate, we can check if the pointer they reference is produced by a 'no compile' + operator. For the compute nest, we can just check the op pragma.""" + if isinstance(stmt, tvm.tir.AttrStmt): + # Remove memory scopes + if stmt.node in pointer_to_producer: + producer_attr = pointer_to_producer[stmt.node] + if ( + producer_attr.attr_key == "pragma_op" + and producer_attr.value.value not in op_map + ): + return stmt.body + + # Remove compute nests + if stmt.attr_key == "pragma_op" and stmt.value.value not in op_map: + return tvm.tir.Evaluate(0) + + if isinstance(stmt, tvm.tir.Allocate): + # Remove allocates + if stmt.buffer_var in pointer_to_producer: + op_attr = pointer_to_producer[stmt.buffer_var] + if op_attr.attr_key == "pragma_op" and op_attr.value.value not in op_map: + return stmt.body + return None + + def _replace_pointers(stmt): + if isinstance(stmt, tvm.tir.AttrStmt): + # If the attribute references a pointer that needs replacing + if stmt.node in replace_output_pointer: + replace_pointer = replace_output_pointer[stmt.node] + # If the pointer doesn't have an extent registered to it, + # this means the pointer is to a Buffer. In this case, we + # just want to delete the memory scope attribute + if replace_pointer not in pointer_to_extents: + return stmt.body + # Otherwise, rewrite the memory scope attribute with the new pointer + return tvm.tir.AttrStmt( + replace_output_pointer[stmt.node], stmt.attr_key, stmt.value, stmt.body + ) + + if isinstance(stmt, tvm.tir.Allocate): + # If the allocate allocates a pointer that needs replacing + if stmt.buffer_var in replace_output_pointer: + replace_pointer = replace_output_pointer[stmt.buffer_var] + # If the pointer doesn't have an extent registered to it, + # this means the pointer is to a Buffer. In this case, we + # just want to delete the allocation statement + if replace_pointer not in pointer_to_extents: + return stmt.body + # Otherwise, rewrite the allocation statement with the new pointer + # and the new extent + replace_type = replace_pointer.type_annotation.element_type.dtype + replace_extents = pointer_to_extents[replace_pointer] + return tvm.tir.Allocate( + replace_pointer, replace_type, replace_extents, stmt.condition, stmt.body + ) + return None + + def _post_transform(stmt): + # Replace operators with call_externs + result = _replace_operator(stmt) + # Remove operators that don't need compiling + result = result or _remove_no_compile(stmt) + # Replace necessary pointers that were removed in the previous step + return result or _replace_pointers(stmt) + + def _ftransform(f, mod, ctx): + tvm.tir.stmt_functor.post_order_visit(f.body, _resolve_pointers) + return f.with_body( + tvm.tir.stmt_functor.ir_transform( + f.body, None, _post_transform, ["tir.AttrStmt", "tir.Allocate"] + ) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.ethosu.replace_operators" + ) + + +def DivideConstants(const_dict): + """This pass rewrites the IR and constant dict such that all constant + accesses are at 0 offset and full length (i.e. they read the whole buffer). 
+ + Where necessary, new constants are created in order to ensure the rewrite + can take place. As an example, if a convolution is tiled along the channels + axis, the accesses to the weights will need to be offset. This pass will + create new constants consisting of 'slices' of the weights so each tile + of the compute can access one of these 'slices'. + + The purpose of this pass is to transform the IR into a form we can apply + constant encoding to (which will compress weights and encode biases).""" + buffer_to_const = {} # type: ignore + new_buffers = [] + new_consts = [] + keep_buffers = set() + new_const_dict = {} + + def _visit(stmt): + new_args = [] + for i, arg in enumerate(stmt.args): + if isinstance(arg, tvm.tir.expr.Load): + # If we're trying to load a buffer that maps to a constant + if arg.buffer_var in buffer_to_const: + const = buffer_to_const[arg.buffer_var] + offset = int(arg.index) + # Note by convention the arg after a constant read is the length of the read + length = int(stmt.args[i + 1]) + # If it's anything other than a full read, create a new buffer + if offset != 0 or len(const) != length: + new_consts.append(const[offset : offset + length]) + new_buffer = tvm.tir.decl_buffer((length,), arg.dtype) + new_buffers.append(new_buffer) + new_args.append(tvm.tir.expr.Load(new_buffer.dtype, new_buffer.data, 0)) + continue + keep_buffers.add(arg.buffer_var) + + new_args.append(arg) + + return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span) + + def _ftransform(f, mod, ctx): + for i, param in enumerate(f.params): + if i in const_dict: + buffer_to_const[param] = const_dict[i].flatten() + buffer_to_const[f.buffer_map[param].data] = const_dict[i].flatten() + + new_body = tvm.tir.stmt_functor.ir_transform(f.body, _visit, None, ["tir.Call"]) + # Both the params and buffer map need updating for the newly introduced buffers + new_params = [] # type: ignore + new_buffer_map = {} + for i, param in enumerate(f.params): + buffer = f.buffer_map[param] + pointer = buffer.data + if pointer in buffer_to_const: + if pointer not in keep_buffers: + continue + new_const_dict[len(new_params)] = const_dict[i] + new_params.append(param) + new_buffer_map[param] = buffer + + for i, new_buffer in enumerate(new_buffers): + handle = tvm.tir.Var("placeholder", "handle") + new_params.append(handle) + new_buffer_map[handle] = new_buffer + new_const_dict[len(new_params) - 1] = new_consts[i] + + new_f = tvm.tir.PrimFunc(new_params, new_body, f.ret_type, new_buffer_map, f.attrs, f.span) + return new_f + + def _divide_constants(mod): + transform_func = tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.ethosu.divide_constants" + ) + new_func = transform_func(mod) + return new_func, new_const_dict + + return _divide_constants + + +def EncodeConstants(const_dict): + """the NPU requires that weights are compressed and bias/scales are 'encoded', both + of which are performed by this pass. + + This pass modifies both the constant dict to contain the post-encoding values of the + constants and the IR to adjust buffer types/sizes/accesses so they align with the + encoded constants. Calls to the Vela API are made to perform the actual compression/ + encoding. 
+ + """ + new_const_dict = {} + buffer_to_const = {} + pointer_to_buffer = {} + rewrite_buffer = {} + rewrite_pointer = {} + accel_type = vela_api.get_target_accel_type() # type: ignore + + def _align_scale_bias(tir_extern_call, bias): + """Align the scale_bias to 16 bytes.""" + value_bytes = bytearray() + value_bytes.extend(bias.tobytes()) + # Align to 16 + remainder = (len(value_bytes)) % 16 + if remainder > 0: + value_bytes.extend(bytearray(16 - remainder)) + value = np.frombuffer(value_bytes, dtype="uint8") + return value + + def _encode_weights(tir_extern_call, weights): + """Encode the weights for a TIR extern call.""" + value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_type) + value = np.frombuffer(value_bytes, dtype="uint8") + return value + + def _new_buffer(old_buffer, new_value): + """Create a new buffer and add the old buffer and its pointer to the + rewriting maps.""" + new_buffer = tvm.tir.decl_buffer((len(new_value),), str(new_value.dtype)) + pointer_to_buffer[new_buffer.data] = new_buffer + rewrite_buffer[old_buffer] = new_buffer + rewrite_pointer[old_buffer.data] = new_buffer.data + buffer_to_const[new_buffer] = new_value + + def _visit_encode_pre(stmt): + if isinstance(stmt, tvm.tir.Call): + # Handle copies as a special-case by propagating the buffer information + # from the read to the write pointer. + if stmt.args[0] == "ethosu_copy": + read_pointer = stmt.args[1].buffer_var + if read_pointer in pointer_to_buffer: + write_pointer = stmt.args[3].buffer_var + # Assert writing to the base of the write_var (pre-StorageRewrite) + assert stmt.args[3].index == 0 + assert stmt.args[1].index == 0 + pointer_to_buffer[write_pointer] = pointer_to_buffer[read_pointer] + else: + # Encode the weights + weights_pointer = get_weights_pointer(stmt) + if weights_pointer is not None: + assert weights_pointer in pointer_to_buffer + weights_buffer = pointer_to_buffer[weights_pointer] + weights_value = buffer_to_const[weights_buffer] + new_weights_value = _encode_weights(stmt, weights_value) + _new_buffer(weights_buffer, new_weights_value) + # Align the scale_bias to 16 bytes + scale_bias_pointer = get_scale_bias_pointer(stmt) + if scale_bias_pointer is not None: + assert scale_bias_pointer in pointer_to_buffer + scale_bias_buffer = pointer_to_buffer[scale_bias_pointer] + scale_bias_value = buffer_to_const[scale_bias_buffer] + new_scale_bias_value = _align_scale_bias(stmt, scale_bias_value) + _new_buffer(scale_bias_buffer, new_scale_bias_value) + + def _visit_encode_post(stmt): + # Because encoding may change the data type (e.g. bias to uint8) and type information + # is stored in pointer vars, it's necessary to rewrite all the pointers which point + # to encoded data. 
+ if isinstance(stmt, tvm.tir.Allocate): + allocate_pointer = stmt.buffer_var + if allocate_pointer in pointer_to_buffer: + buffer = pointer_to_buffer[allocate_pointer] + if buffer in rewrite_buffer: # If the pointer needs rewriting + # Create a new pointer var with the type of the new buffer + new_buffer = rewrite_buffer[buffer] + storage_type = tvm.ir.PrimType(new_buffer.dtype) + new_pointer = tvm.tir.Var( + allocate_pointer.name, + tvm.ir.PointerType(storage_type, buffer.scope()), + allocate_pointer.span, + ) + # Set the new pointer to resolve to the new buffer + pointer_to_buffer[new_pointer] = new_buffer + # Add the old pointer to the pointer rewriting dict + rewrite_pointer[allocate_pointer] = new_pointer + + def _visit_rewrite(stmt): + if isinstance(stmt, tvm.tir.Call): + # For extern calls, we need to rewrite pairs of arguments corresponding to + # base address load and the length of the load. + new_args = [stmt.args[0]] + for i in range(1, len(stmt.args)): + # If the previous argument was a load, the current should be a length + if isinstance(stmt.args[i - 1], tvm.tir.Load): + load = stmt.args[i - 1] + pointer = load.buffer_var + if pointer in pointer_to_buffer: + new_args.append(np.prod(list(pointer_to_buffer[pointer].shape))) + continue + new_args.append(stmt.args[i]) + + return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span) + if isinstance(stmt, tvm.tir.Allocate): + # Where a pointer needs rewriting, the allocate for it must be rewritten + allocate_pointer = stmt.buffer_var + if allocate_pointer in pointer_to_buffer: + if pointer_to_buffer[allocate_pointer] in rewrite_buffer: + new_buffer = rewrite_buffer[pointer_to_buffer[allocate_pointer]] + new_pointer = rewrite_pointer[allocate_pointer] + return tvm.tir.Allocate( + new_pointer, + new_buffer.dtype, + new_buffer.shape, + stmt.condition, + stmt.body, + stmt.span, + ) + # The following rewrites would be better expressed by just rewriting the Vars, however + # ir_transform doesn't seem to visit Vars. So instead we do the next best thing and rewrite + # the nodes which contain the Vars. 
+ if isinstance(stmt, tvm.tir.Load): + load_pointer = stmt.buffer_var + if load_pointer in rewrite_pointer: + new_pointer = rewrite_pointer[load_pointer] + element_type = new_pointer.type_annotation.element_type.dtype + return tvm.tir.Load( + element_type, new_pointer, stmt.index, stmt.predicate, stmt.span + ) + if isinstance(stmt, tvm.tir.AttrStmt): + node_pointer = stmt.node + if node_pointer in rewrite_pointer: + return tvm.tir.AttrStmt( + rewrite_pointer[node_pointer], stmt.attr_key, stmt.value, stmt.body, stmt.span + ) + return None + + def _ftransform(f, mod, ctx): + for i, param in enumerate(f.params): + if i in const_dict: + buffer_to_const[f.buffer_map[param]] = const_dict[i].flatten() + pointer_to_buffer[f.buffer_map[param].data] = f.buffer_map[param] + + # First analyse what needs to be rewritten + new_body = tvm.tir.stmt_functor.ir_transform( + f.body, _visit_encode_pre, _visit_encode_post, ["tir.Call", "tir.Allocate"] + ) + # Then perform the rewrites + new_body = tvm.tir.stmt_functor.ir_transform( + f.body, None, _visit_rewrite, ["tir.Call", "tir.Allocate", "tir.Load", "tir.AttrStmt"] + ) + new_buffer_map = {} + # Rewrite the buffer map and const dict to instead use the encoded versions + for i, param in enumerate(f.params): + buffer = f.buffer_map[param] + if buffer in rewrite_buffer: + new_buffer = rewrite_buffer[buffer] + new_buffer_map[param] = new_buffer + new_value = buffer_to_const[new_buffer] + new_const_dict[i] = new_value + elif buffer in buffer_to_const: + new_const_dict[i] = buffer_to_const[buffer] + new_buffer_map[param] = buffer + else: + new_buffer_map[param] = buffer + + new_f = tvm.tir.PrimFunc(f.params, new_body, f.ret_type, new_buffer_map, f.attrs, f.span) + return new_f + + def _encode_constants(mod): + mod, divided_const_dict = DivideConstants(const_dict)(mod) + const_dict.clear() + for key, value in divided_const_dict.items(): + const_dict[key] = value + transform_func = tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.ethosu.encode_constants" + ) + new_func = transform_func(mod) + return new_func, new_const_dict + + return _encode_constants diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py new file mode 100644 index 000000000000..5d9027bf2078 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py @@ -0,0 +1,277 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Different schedulers for Arm(R) Ethos(TM)-U NPU""" +import tvm + + +def schedule(te_graph, const_dict, cascader=None): + """Schedule a TE graph for NPU compilation. + + Parameters + ---------- + te_graph + The TE graph to schedule. 
+    const_dict : dict of int to numpy.ndarray
+        The constant dictionary.
+    cascader : callable, optional
+        A cascading function used to apply an optimized schedule
+        to the graph.
+
+    Returns
+    -------
+    s : tvm.te.Schedule
+        The completed schedule for the graph.
+
+    """
+    s = tvm.te.create_schedule([t.op for t in te_graph.outputs])
+    if cascader:
+        cascader(te_graph, const_dict, s)
+    inline_no_ops(te_graph, s)
+    schedule_pragmas(s)
+    schedule_cache_reads(s)
+    return s
+
+
+def tile_nd(s, tensor, tile):
+    """Scheduling utility to perform N-dimensional tiling.
+
+    Parameters
+    ----------
+    s : tvm.te.Schedule
+        The schedule to apply the tiling to.
+    tensor : tvm.te.Tensor
+        The tensor to apply the tiling to.
+    tile : tuple
+        The N-dimensional tile size.
+
+    Returns
+    -------
+    outer_indices : list of tvm.tir.IterVar
+        The outer iteration variables.
+    inner_indices : list of tvm.tir.IterVar
+        The inner iteration variables.
+
+    """
+    outer_indices = []
+    inner_indices = []
+    for i, size in enumerate(tile):
+        outer, inner = s[tensor].split(tensor.op.axis[i], size)
+        outer_indices.append(outer)
+        inner_indices.append(inner)
+
+    s[tensor].reorder(*outer_indices, *inner_indices)
+    return outer_indices, inner_indices
+
+
+def total_cascader(stripe_size):
+    """A demo/test cascader which tries to cascade every op in the graph together.
+
+    The desired output stripe size should be specified. Note this only works
+    for single output graphs.
+
+    Parameters
+    ----------
+    stripe_size : tuple
+        The output stripe size.
+
+    Returns
+    -------
+    func : callable
+        The cascading function.
+
+    """
+
+    def _cascader(te_graph, const_dict, sch):
+        scheduled = set()
+
+        def _visit(tensor, stage, ax):
+            if tensor not in scheduled and isinstance(tensor.op, tvm.te.ComputeOp):
+                sch[tensor].compute_at(stage, ax)
+                scheduled.add(tensor)
+                for input_tensor in tensor.op.input_tensors:
+                    _visit(input_tensor, stage, ax)
+
+        assert len(te_graph.outputs) == 1
+        out = te_graph.outputs[0]
+        oi, _ = tile_nd(sch, out, stripe_size)
+        for ax in oi:
+            sch[out].unroll(ax)
+        for input_tensor in out.op.input_tensors:
+            _visit(input_tensor, sch[out], oi[-1])
+
+    return _cascader
+
+
+def copy_constants():
+    """A simple planner which copies all constant data from FLASH -> SRAM.
+
+    Returns
+    -------
+    planner : callable
+        The planning function.
+    """
+
+    def _planner(te_graph, const_dict, sch):
+        planned = set()  # type: ignore
+
+        def _visit(tensor, reader):
+            if tensor not in planned:
+                planned.add(tensor)
+                if isinstance(tensor.op, tvm.te.PlaceholderOp):
+                    index = list(te_graph.inputs).index(tensor)
+                    if index in const_dict:
+                        sch.cache_read(tensor, "global", [reader])
+
+                elif isinstance(tensor.op, tvm.te.ComputeOp):
+                    for input_tensor in tensor.op.input_tensors:
+                        _visit(input_tensor, tensor)
+
+        for output_tensor in te_graph.outputs:
+            _visit(output_tensor, None)
+
+    return _planner
+
+
+def schedule_pragmas(sch):
+    """Add pragmas to the operators that require them.
+
+    This adds the pragmas used for codegen to the NPU ops.
+    They are taken directly from the TE compute op's attributes.
+    Modifies the schedule in-place.
+
+    Parameters
+    ----------
+    sch : tvm.te.Schedule
+        The schedule.
+ + """ + + def _add_pragmas(stage, ax): + if "op" in [attr for attr, val in stage.op.attrs.items()]: + stage.pragma(ax, "op", stage.op.attrs["op"]) + for attr, val in stage.op.attrs.items(): + if attr != "op": + stage.pragma(ax, str(attr), val) + + for stage in sch.stages: + if ( + isinstance(stage.op, tvm.te.ComputeOp) + and len(stage.op.axis) + len(stage.op.reduce_axis) > 0 + ): + # The logic ensures the pragmas are assigned to the inner tiling loops + # rather than the outer ones (which end up getting unrolled). + num_inner_loops = len(stage.op.axis) + len(stage.op.reduce_axis) + ax = stage.leaf_iter_vars[-num_inner_loops] + _add_pragmas(stage, ax) + + +def schedule_cache_reads(sch): + """Schedule cache reads that have been introduced. + + There are two things we need to happen to cache_read stages. They should be tagged + with the 'ethosu_copy' pragma and have all their axes fused to make them 1D. + + Parameters + ---------- + sch : tvm.te.Schedule + The schedule. + + """ + + def _detect_cache_read(stage): + # Try and detect cache_reads by checking if the compute op is identity + if isinstance(stage.op, tvm.te.ComputeOp): + op = stage.op + if "ethosu" in op.name: + return False + axes = op.axis + if len(op.input_tensors) == 1: + tensor = op.input_tensors[0] + try: + identity_op = tensor(*axes) + except ValueError: + return False + if tvm.tir.analysis.expr_deep_equal(identity_op, op.body[0]): + return True + return False + + for stage in sch.stages: + if _detect_cache_read(stage): + fax = stage.fuse(*stage.op.axis) + stage.pragma(fax, "op", "ethosu_copy") + + +def inline_no_ops(te_graph, sch): + """Inline 'no-ops' - operations that in principle do nothing. + + Modifies the schedule in-place. For now we inline reshape and + strided slice - more could be added. + + Parameters + ---------- + te_graph + The TE graph. + sch : tvm.te.Schedule + The schedule. 
+ + """ + no_ops = {"T_reshape", "T_strided_slice"} + scheduled = set() + + def _visit(tensor): + if tensor not in scheduled and isinstance(tensor.op, tvm.te.ComputeOp): + if tensor.op.name in no_ops: + sch[tensor].compute_inline() + scheduled.add(tensor) + for input_tensor in tensor.op.input_tensors: + _visit(input_tensor) + + for out in te_graph.outputs: + _visit(out) + + +class Convolution2DCompute: + """A helper class to manipulate the series of compute ops that make up a 2D convolution.""" + + def __init__(self, read, convert_to_nhwc, pad, conv2d, convert_to_nhcwb16, write): + self.read = read + self.convert_to_nhwc = convert_to_nhwc + self.pad = pad + self.conv2d = conv2d + self.convert_to_nhcwb16 = convert_to_nhcwb16 + self.write = write + + @classmethod + def from_output(cls, out): + write = out + convert_to_nhcwb16 = write.op.input_tensors[0] + conv2d = convert_to_nhcwb16.op.input_tensors[0] + pad = conv2d.op.input_tensors[0] + convert_to_nhwc = pad.op.input_tensors[0] + read = convert_to_nhwc.op.input_tensors[0] + return cls(read, convert_to_nhwc, pad, conv2d, convert_to_nhcwb16, write) + + def split(self, sch, axis, val): + outer, inner = sch[self.write].split(self.write.op.axis[axis], val) + sch[self.write].reorder( + outer, *[ax for ax in self.write.op.axis if ax != self.write.op.axis[axis]], inner + ) + sch[self.write].unroll(outer) + g = sch.create_group(outputs=self.convert_to_nhcwb16, inputs=self.read, include_inputs=True) + g.compute_at(sch[self.write], outer) + return outer diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/spec.py b/python/tvm/relay/backend/contrib/ethosu/tir/spec.py new file mode 100644 index 000000000000..3ecbcd5f3cdc --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/spec.py @@ -0,0 +1,263 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""The TIR serialization specification for Arm(R) Ethos(TM)-U NPU.""" +from typing import Union +from typing import get_type_hints +from inspect import isclass + +import tvm +from tvm.relay.backend.contrib.ethosu import util + + +def create_serial_object(serialized_type, deserialized_elements): + """ + This function will create serialized type that is one of the subclasses + of tvm.relay.backend.contrib.ethosu.tir.spec.SerializableFormat + + Parameters + ---------- + serialized_type : a subclass type of SerializableFormat + + deserialized_elements : list + The list of arguments that needs to packed to create SerializableFormat objects + + Returns + ------- + The constructed object of type serialized_type + """ + + def _create_serial_object(internal_serialized_type, read_element_idx=0): + """The internal function that increments the read_element_idx + when creating nested serial objects""" + arg_len = util.get_arg_count(internal_serialized_type.__init__) - 1 + serial_init_types = get_type_hints(internal_serialized_type.__init__) + serial_init_arg_names = list(serial_init_types.keys()) + serial_init_args = [] + assert arg_len == len(serial_init_arg_names) + for si_arg_name in serial_init_arg_names: + si_arg_type = serial_init_types[si_arg_name] + if isclass(si_arg_type) and issubclass(si_arg_type, SerializableFormat): + sia, read_element_idx = _create_serial_object(si_arg_type, read_element_idx) + serial_init_args.append(sia) + else: + serial_init_args.append(deserialized_elements[read_element_idx]) + read_element_idx += 1 + return internal_serialized_type(*serial_init_args), read_element_idx + + # Just return the primary serial object + return _create_serial_object(serialized_type)[0] + + +class SerializableFormat: + """Base class to retrieve arguments on a predefined ordering""" + + def __iter__(self): + # Note class attribute definition order is preserved - see PEP 520 + for name in self.__dict__: + value = self.__getattribute__(name) + if isinstance(value, SerializableFormat): + yield from list(value) + else: + yield value + + def __getitem__(self, index): + # Note class attribute definition order is preserved - see PEP 520 + name = list(self.__dict__.keys())[index] + return self.__getattribute__(name) + + +class SerialFeatureMap(SerializableFormat): + """Specialization class to retrieve arguments of a Feature Map + (similiar to NpuFeatureMap of Vela) on a predefined ordering""" + + def __init__( + self, + data_type: str, + height: int, + width: int, + channels: int, + tile_height_0: int, + tile_height_1: int, + tile_width_0: int, + tile_address_0: tvm.tir.expr.Load, + tile_address_1: Union[tvm.tir.expr.Load, int], + tile_address_2: Union[tvm.tir.expr.Load, int], + tile_address_3: Union[tvm.tir.expr.Load, int], + scale: float, + zero_point: int, + layout: str, + stride_h: int, + stride_w: int, + stride_c: int, + ): + self.data_type = data_type + self.height = height + self.width = width + self.channels = channels + self.tile_height_0 = tile_height_0 + self.tile_height_1 = tile_height_1 + self.tile_width_0 = tile_width_0 + self.tile_address_0 = tile_address_0 + self.tile_address_1 = tile_address_1 + self.tile_address_2 = tile_address_2 + self.tile_address_3 = tile_address_3 + self.scale = scale + self.zero_point = zero_point + self.layout = layout + self.stride_h = stride_h + self.stride_w = stride_w + self.stride_c = stride_c + + +class SerialKernel(SerializableFormat): + """Specialization class to retrieve arguments of a Kernel + (similiar to NpuKernel of Vela) on a predefined 
ordering""" + + def __init__( + self, + width: int, + height: int, + stride_w: int, + stride_h: int, + dilation_w: int, + dilation_h: int, + ): + self.width = width + self.height = height + self.stride_w = stride_w + self.stride_h = stride_h + self.dilation_w = dilation_w + self.dilation_h = dilation_h + + +class SerialAddressRange(SerializableFormat): + """Specialization class to retrieve arguments of a AddressRange + (similiar to NpuAddressRange of Vela) on a predefined ordering""" + + def __init__(self, address: tvm.tir.expr.Load, length: int): + self.address = address + self.length = length + + +class SerialPadding(SerializableFormat): + """Specialization class to retrieve arguments of a Padding + (similiar to NpuPadding of Vela) on a predefined ordering""" + + def __init__(self, top: int, left: int, bottom: int, right: int): + self.top = top + self.left = left + self.bottom = bottom + self.right = right + + +class SerialActivation(SerializableFormat): + """Specialization class to retrieve arguments of a Activation + (similiar to NpuActivation of Vela) on a predefined ordering""" + + def __init__(self, op: str, clip_min: int, clip_max: int): + self.op = op + self.clip_min = clip_min + self.clip_max = clip_max + + +class Serial2DConvolution(SerializableFormat): + """Specialization class to retrieve arguments of + a ethosu.conv2d tir extern call on a predefined ordering""" + + def __init__( + self, + ifm: SerialFeatureMap, + ofm: SerialFeatureMap, + kernel: SerialKernel, + weight: SerialAddressRange, + weight_zero_point: int, + scale_bias: SerialAddressRange, + padding: SerialPadding, + activation: SerialActivation, + upscale: str, + ): + self.ifm = ifm + self.ofm = ofm + self.kernel = kernel + self.weight = weight + self.weight_zero_point = weight_zero_point + self.scale_bias = scale_bias + self.padding = padding + self.activation = activation + self.upscale = upscale + + +class Serial2DDepthwise(SerializableFormat): + """Specialization class to retrieve arguments of + a ethosu.depthwise2d tir extern call on a predefined ordering""" + + def __init__( + self, + ifm: SerialFeatureMap, + ofm: SerialFeatureMap, + kernel: SerialKernel, + weight: SerialAddressRange, + weight_zero_point: int, + scale_bias: SerialAddressRange, + padding: SerialPadding, + activation: SerialActivation, + upscale: str, + ): + self.ifm = ifm + self.ofm = ofm + self.kernel = kernel + self.weight = weight + self.weight_zero_point = weight_zero_point + self.scale_bias = scale_bias + self.padding = padding + self.activation = activation + self.upscale = upscale + + +class SerialCopy(SerializableFormat): + """Specialization class to retrieve arguments of + a ethosu.copy tir extern call on a predefined ordering""" + + def __init__( + self, read_address: tvm.tir.expr.Load, length: int, write_address: tvm.tir.expr.Load + ): + self.read_address = read_address + self.length = length + self.write_address = write_address + + +class SerialPooling(SerializableFormat): + """Specialization class to retrieve arguments of + a ethosu.pooling tir extern call on a predefined ordering""" + + def __init__( + self, + ifm: SerialFeatureMap, + ofm: SerialFeatureMap, + pooling_type: str, + pool_shape: SerialKernel, + padding: SerialPadding, + activation: SerialActivation, + upscale: str, + ): + self.ifm = ifm + self.ofm = ofm + self.pooling_type = pooling_type + self.pool_shape = pool_shape + self.padding = padding + self.activation = activation + self.upscale = upscale diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/transform.py 
b/python/tvm/relay/backend/contrib/ethosu/tir/transform.py new file mode 100644 index 000000000000..0403ce2c7e8f --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/transform.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Extract information from the transform operators in TIR.""" +import tvm +from .spec import SerialCopy +from .utils import get_base_address, get_op_attrs + + +def get_copy_params(stmt, producers, consumers): + """Get the parameters necessary to construct a call_extern for a copy. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a copy loop nest. + producers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that produces their values. + consumers : dict of tvm.tir.Var to tvm.tir.AttrStmt + A dictionary to associate pointers with the loop nest + that consumes their values. + + Returns + ------- + SerialCopy + The parameters needed to construct a copy. + tvm.tir.Var + The output pointer of the copy operation. + + """ + _, body = get_op_attrs(stmt) + length = body.extent + write_store = body.body + write_base = get_base_address(write_store.index) + read_load = body.body.value + read_base = get_base_address(read_load.index) + dtype = body.body.value.dtype + return ( + SerialCopy( + read_address=tvm.tir.expr.Load(dtype, read_load.buffer_var, read_base), + length=length, + write_address=tvm.tir.expr.Load(dtype, write_store.buffer_var, write_base), + ), + write_store.buffer_var, + None, + ) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py new file mode 100644 index 000000000000..7d6fd3bf82d8 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py @@ -0,0 +1,222 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name +"""Helper utility functions used by the TIR compiler""" +import tvm +from tvm import arith + + +# TODO(@mbaret): Formalise this with a specification +def get_weights_pointer(tir_extern_call): + """Get the weights pointer from a NPU extern call if it exists""" + if tir_extern_call.args[0] == "ethosu_conv2d": + return tir_extern_call.args[41].buffer_var + return None + + +# TODO(@mbaret): Formalise this with a specification +def get_scale_bias_pointer(tir_extern_call): + """Get the scale_bias pointer from a NPU extern call if it exists""" + if tir_extern_call.args[0] == "ethosu_conv2d": + return tir_extern_call.args[44].buffer_var + return None + + +def get_op_attrs(stmt): + """Iterate through nested attribute statements accumulating their values + in an attribute dictionary. + + The "pragma_" prefix is removed as a convenience. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement to begin from. + + Returns + ------- + attrs : dict of str to object + The attribute dictionary. + stmt : tvm.tir.Stmt + The body after having collected the final attribute statement. + + """ + attrs = {} + while isinstance(stmt, tvm.tir.AttrStmt): + # The pragma scheduler inserts "pragma_" before all the + # attr names, this is annoying so we get rid of it + attr = stmt.attr_key.replace("pragma_", "") + attrs[attr] = stmt.value + stmt = stmt.body + + return attrs, stmt + + +def get_strides(index, stride_vars): + """Get the striding of given vars in an indexing expression. + + Parameters + ---------- + index : tvm.tir.PrimExpr + The index expression where the stride vars are present. + stride_vars : list of tvm.tir.Var + The vars to determine the striding of. + + Returns + ------- + strides : list of int + The striding of each stride var in the index expression + in the same order as the stride vars were given. + + """ + strides = [1] * len(stride_vars) + dmap = {} + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Var): + dmap[stmt] = arith.IntervalSet(0, 0) + + tvm.tir.stmt_functor.post_order_visit(index, _visit) + min_value = int(arith.Analyzer().int_set(index, dmap).min_value) + for var in dmap: + if var in stride_vars: + # NOTE: Doing this using a [0, 1] interval doesn't work reliably + # Seems to be a bug + dmap[var] = arith.IntervalSet(1, 1) + max_value = int(arith.Analyzer().int_set(index, dmap).max_value) + stride = int(max_value - min_value) + i = stride_vars.index(var) + strides[i] = stride + dmap[var] = arith.IntervalSet(0, 0) + + return strides + + +def get_base_address(index): + """Determine the first (base) address accessed by an index expression. + + Parameters + ---------- + index : tvm.tir.PrimExpr + The index expression to determine the base address of. + + Returns + ------- + base_address: + The first address accessed by the index expression. + + """ + dmap = {} + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Var): + dmap[stmt] = arith.IntervalSet(0, 0) + + tvm.tir.stmt_functor.post_order_visit(index, _visit) + base_address = int(arith.Analyzer().int_set(index, dmap).min_value) + return base_address + + +def get_outer_loops(stmt, layout): + """Get the outer loops of an operator. + + Parameters + ---------- + stmt : tvm.tir.For + The outermost loop. + layout : str + The output tensor layout (NHWC or NHCWB16). + + Returns + ------- + n : tvm.tir.For + The batch loop. + h : tvm.tir.For + The height loop. + w : tvm.tir.For + The width loop. + c : tvm.tir.For + The channels loop. + b : tvm.tir.For + The brick loop. 
None for NHWC + body : tvm.tir.Stmt + The inner body of the loops. + + """ + if layout == "NHWC": + n = stmt + h = n.body + w = h.body + c = w.body + b = tvm.tir.For(tvm.tir.Var("b", "int32"), 0, 0, 0, tvm.tir.Evaluate(0)) + return n, h, w, c, b, c.body + if layout == "NHCWB16": + n = stmt + h = n.body + cb = h.body + w = cb.body + b = w.body + return n, h, w, cb, b, b.body + return None + + +def get_loads(stmt): + """Get the Load statements. + + Parameters + ---------- + stmt : tvm.tir.Stmt + The statement to get the Loads from. + + Returns + ------- + loads : list of tvm.tir.Load + The Loads found. + + """ + loads = [] + + def _visit(s): + if isinstance(s, tvm.tir.Load): + loads.append(s) + + tvm.tir.stmt_functor.post_order_visit(stmt, _visit) + return loads + + +def get_stores(stmt): + """Get the Store statements. + + Parameters + ---------- + stmt : tvm.tir.Stmt + The statement to get the Stores from. + + Returns + ------- + stores : list of tvm.tir.Store + The Stores found. + + """ + stores = [] + + def _visit(s): + if isinstance(s, tvm.tir.Store): + stores.append(s) + + tvm.tir.stmt_functor.post_order_visit(stmt, _visit) + return stores diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py new file mode 100644 index 000000000000..ce9abcbd683d --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -0,0 +1,332 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""This source will contain code to convert TIR, as produced by +the Relay to TIR compilation process, to Vela API calls to +generate command stream. +""" +from typing import NamedTuple +from enum import auto +from enum import Enum +import numpy as np # type: ignore +import ethosu.vela.api as vapi # type: ignore + +import tvm +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.backend.contrib.ethosu.tir import spec + + +class BufferType(Enum): + """The buffer types the codegen supports""" + + constant = auto() + input_or_output = auto() + scratch = auto() + input = auto() + output = auto() + + +class BufferInfo(NamedTuple): + """A data structure to hold metadata of the buffer""" + + # If the buffer holds constants, the values will contain that otherwise None + values: np.ndarray + shape: tvm.ir.container.Array + dtype: np.dtype + btype: BufferType + + +def extract_buffer_info(mod, param_dict): + """ + This function is to read the tvm.IRModule that + contains Relay to TIR compiled IRModule. Thereafter, + this will extract the buffer information as the shape + and constant data (if any). + + Parameters + ---------- + mod : tvm.IRModule + The NPU TIR IRModule. 
+    param_dict : dict
+        A dictionary containing param idx --> const numpy.NDArray
+
+    Returns
+    -------
+    dict
+        a dictionary of buffer names --> BufferInfo
+    """
+    buffer_info = dict()
+    # There should only be a single function
+    assert len(mod.functions.items()) == 1
+    primfunc = mod.functions.items()[0][1]
+    for idx, const_data in param_dict.items():
+        param = primfunc.params[idx]
+        buffer_info[primfunc.buffer_map[param].data] = BufferInfo(
+            const_data, const_data.shape, const_data.dtype, BufferType.constant
+        )
+
+    for param in primfunc.params:
+        if primfunc.buffer_map[param].data not in buffer_info.keys():
+            buffer_info[primfunc.buffer_map[param].data] = BufferInfo(
+                None,
+                primfunc.buffer_map[param].shape,
+                primfunc.buffer_map[param].dtype,
+                BufferType.input_or_output,
+            )
+
+    def populate_allocate_buffer_info(stmt):
+        if isinstance(stmt, tvm.tir.stmt.Allocate):
+            allocate = stmt
+            buffer_info[allocate.buffer_var] = BufferInfo(
+                None,
+                allocate.extents,
+                allocate.dtype,
+                BufferType.scratch,
+            )
+
+    tvm.tir.stmt_functor.post_order_visit(primfunc.body, populate_allocate_buffer_info)
+
+    return buffer_info
+
+
+def _convert_clip_bounds(npu_op):
+    """
+    This function converts the min and max values
+    of clip activations to non-quantized floats, as
+    expected by the API.
+
+    Parameters
+    ----------
+    npu_op : ethosu.vela.api.NpuBlockOperation
+    """
+    clip_min_quant = npu_op.activation.min
+    clip_max_quant = npu_op.activation.max
+    clip_min_actual = (
+        clip_min_quant - npu_op.ofm.quantization.zero_point
+    ) * npu_op.ofm.quantization.scale_f32
+    clip_max_actual = (
+        clip_max_quant - npu_op.ofm.quantization.zero_point
+    ) * npu_op.ofm.quantization.scale_f32
+    npu_op.activation.min = clip_min_actual
+    npu_op.activation.max = clip_max_actual
+
+
+def translate_ethosu_conv2d(tir_extern_call):
+    """This function translates a TIR extern_call
+    as produced by the Relay-to-TIR compilation.
+
+    Parameters
+    ----------
+    tir_extern_call : tvm.tir.Call
+        This should be a TIR extern call that follows the ordering agreed
+        upon with the TIR compiler. See Serial2DConvolution in
+        tvm/relay/backend/contrib/ethosu/tir/spec.py for the ordering.
+ + Returns + ------- + ethosu.vela.api.NpuConv2DOperation + The vela object containing the params of ethosu_conv2d + weights_zero_point : int + The zero point of the weights + """ + # We skip the first element as it is the extern_call function name + serial_object = spec.create_serial_object(spec.Serial2DConvolution, tir_extern_call.args[1:]) + return _create_npu_op_conv2d(serial_object) + + +def _create_npu_op_conv2d(serial_2d_convolution): + """This is a helper function to capture a list + of arguments to create Vela NpuConv2DOperation object + """ + npu_conv2d_op = vapi.NpuConv2DOperation() + npu_conv2d_op.ifm = _create_npu_feature_map(serial_2d_convolution.ifm) + npu_conv2d_op.ofm = _create_npu_feature_map(serial_2d_convolution.ofm) + npu_conv2d_op.kernel = _create_npu_kernel(serial_2d_convolution.kernel) + npu_conv2d_op.weights = [_create_npu_address_range(serial_2d_convolution.weight)] + weights_zero_point = np.int64(serial_2d_convolution.weight_zero_point.value) + npu_conv2d_op.biases = [_create_npu_address_range(serial_2d_convolution.scale_bias)] + npu_conv2d_op.padding = _create_npu_padding(serial_2d_convolution.padding) + + npu_conv2d_op.activation = _create_npu_activation(serial_2d_convolution.activation) + if ( + npu_conv2d_op.activation + and npu_conv2d_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU + ): + _convert_clip_bounds(npu_conv2d_op) + + npu_conv2d_op.upscale = _create_npu_resampling_mode(serial_2d_convolution.upscale) + target_accel_type = vela_api.get_target_accel_type() # type: ignore + block_config = vela_api.get_optimal_block_config(npu_conv2d_op, target_accel_type) + npu_conv2d_op.block_config = block_config + weights_shape_ohwi = [ + npu_conv2d_op.ofm.shape.depth, + npu_conv2d_op.kernel.height, + npu_conv2d_op.kernel.width, + npu_conv2d_op.ifm.shape.depth, + ] + npu_conv2d_op.block_traversal = vela_api.calculate_block_traversal_mode( + is_depthwise=False, + weights_shape_ohwi=weights_shape_ohwi, + ifm_bitdepth=npu_conv2d_op.ifm.data_type.size_in_bits(), + ) + return npu_conv2d_op, weights_zero_point + + +def _create_npu_feature_map(serial_feature_map): + """This is a helper function to capture a list + of arguments to create Vela NpuFeatureMap object + """ + layout_map = {"NHWC": vapi.NpuLayout.NHWC, "NHCWB16": vapi.NpuLayout.NHCWB16} + datatype_map = { + "uint8": vapi.NpuDataType.UINT8, + "int8": vapi.NpuDataType.INT8, + "uint16": vapi.NpuDataType.UINT16, + "int16": vapi.NpuDataType.INT16, + "int32": vapi.NpuDataType.INT32, + } + layout = str(serial_feature_map.layout.value) + data_type = str(serial_feature_map.data_type.value) + assert layout in layout_map.keys() + assert data_type in datatype_map.keys() + nfm = vapi.NpuFeatureMap() + nfm.data_type = datatype_map[data_type] + nfm.shape = vapi.NpuShape3D( + int(serial_feature_map.height.value), + int(serial_feature_map.width.value), + int(serial_feature_map.channels.value), + ) + nfm.tiles = vapi.NpuTileBox( + int(serial_feature_map.tile_height_0.value), + int(serial_feature_map.tile_height_1.value), + int(serial_feature_map.tile_width_0.value), + [ + serial_feature_map.tile_address_0, + serial_feature_map.tile_address_1, + serial_feature_map.tile_address_2, + serial_feature_map.tile_address_3, + ], + ) + nfm.quantization = _create_npu_quantization( + serial_feature_map.scale, serial_feature_map.zero_point + ) + nfm.layout = layout_map[layout] + nfm.strides = vapi.NpuShape3D( + int(serial_feature_map.stride_h.value), + int(serial_feature_map.stride_w.value), + 
int(serial_feature_map.stride_c.value), + ) + return nfm + + +def _create_npu_kernel(serial_kernel): + """This is a helper function to capture a list + of arguments to create Vela NpuKernel object + """ + nknl = vapi.NpuKernel( + w=int(serial_kernel.width.value), + h=int(serial_kernel.height.value), + stride_x=int(serial_kernel.stride_w.value), + stride_y=int(serial_kernel.stride_h.value), + dilation_x=int(serial_kernel.dilation_w.value), + dilation_y=int(serial_kernel.dilation_h.value), + ) + return nknl + + +def _create_npu_address_range(serial_address_range): + """This is a helper function to capture a list + of arguments to create Vela NpuAddressRange object + """ + addr_range = vapi.NpuAddressRange( + # region will be updated later + region=0, + address=serial_address_range.address, + length=int(serial_address_range.length.value), + ) + return addr_range + + +def _create_npu_quantization( + scale, + zero_point, +): + """This is a helper function to capture a list + of arguments to create Vela NpuQuantization object + """ + # Scale could be an ndarray if per-channel quantization is available + if not isinstance(scale, tvm.tir.expr.Load): + if isinstance(scale.value, float): + scale = np.single(scale.value) + else: + assert isinstance(scale.value.value, float) + scale = np.single(scale.value.value) + q_params = vapi.NpuQuantization(scale_f32=scale, zero_point=zero_point.value) + return q_params + + +def _create_npu_weights_zero_point( + zero_point, +): + """This is a helper function to capture the weights zero point""" + return zero_point.value + + +def _create_npu_padding(serial_padding): + """This is a helper function to capture a list + of arguments to create Vela NpuPadding object""" + padding = vapi.NpuPadding( + top=int(serial_padding.top.value), + left=int(serial_padding.left.value), + bottom=int(serial_padding.bottom.value), + right=int(serial_padding.right.value), + ) + return padding + + +def _create_npu_activation(serial_activation): + """This is a helper function to capture a list + of arguments to create Vela NpuActivation object""" + if serial_activation.op == "NONE": + return None + if ( + serial_activation.op == "CLIP" + and serial_activation.clip_min == 0 + and serial_activation.clip_max == 0 + ): + return None + op_map = { + "CLIP": vapi.NpuActivationOp.NONE_OR_RELU, + "TANH": vapi.NpuActivationOp.TANH, + "SIGMOID": vapi.NpuActivationOp.SIGMOID, + } + op = str(serial_activation.op.value) + assert op in op_map.keys() + act_op = vapi.NpuActivation(op_map[op]) + act_op.min = int(serial_activation.clip_min.value) + act_op.max = int(serial_activation.clip_max.value) + return act_op + + +def _create_npu_resampling_mode( + mode, +): + """This is a helper function to capture a list + of arguments to create Vela NpuResamplingMode object""" + mode_map = { + "NONE": vapi.NpuResamplingMode.NONE, + "NEAREST": vapi.NpuResamplingMode.NEAREST, + "TRANSPOSE": vapi.NpuResamplingMode.TRANSPOSE, + } + mode = str(mode.value) + assert mode in mode_map.keys() + return mode_map[mode] diff --git a/python/tvm/relay/backend/contrib/ethosu/util.py b/python/tvm/relay/backend/contrib/ethosu/util.py new file mode 100644 index 000000000000..0919d3fe7a5f --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/util.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Helper utility Enums and Functions used throughout code generation.
+
+The rest of the utility functions are miscellaneous;
+refer to the description inside such functions.
+"""
+
+from inspect import signature
+from enum import Enum
+from typing import Union, Tuple, Dict, Optional
+import numpy as np  # type: ignore
+
+import tvm  # type: ignore
+from tvm import relay
+from tvm.relay.build_module import bind_params_by_name  # type: ignore
+from tvm.relay.backend.contrib.ethosu import preprocess  # type: ignore
+
+
+class QConv2DArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of qnn.conv2d arguments.
+    """
+
+    IFM = 0
+    WEIGHTS = 1
+    IFM_ZERO_POINT = 2
+    WEIGHTS_ZERO_POINT = 3
+    IFM_SCALE = 4
+    WEIGHTS_SCALE = 5
+
+
+class RequantArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of qnn.requantize arguments.
+    """
+
+    IFM_SCALE = 1
+    IFM_ZERO_POINT = 2
+    OFM_SCALE = 3
+    OFM_ZERO_POINT = 4
+
+
+class BiasAddArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of qnn.bias_add arguments.
+    """
+
+    BIASES = 1
+
+
+class ClipArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of clip arguments.
+    """
+
+    A_MIN = 1
+    A_MAX = 2
+
+
+def is_composite_func(func: relay.Function, name: str) -> bool:
+    """
+    This method checks whether the given function is a
+    composite function of the given name.
+
+    Parameters
+    ----------
+    func : relay.Function
+        The function to be checked.
+
+    name : str
+        The candidate composite name to be checked against.
+
+    Returns
+    -------
+    bool
+        True if func is a composite function of the given name.
+    """
+
+    if not hasattr(func, "attrs"):
+        return False
+    if "Composite" not in func.attrs.keys():
+        return False
+    composite_name = func.attrs["Composite"]
+
+    return composite_name == name
+
+
+def get_range_for_dtype_str(dtype: str) -> Tuple[int, int]:
+    """
+    Produce the min and max for a given data type.
+
+    Parameters
+    ----------
+    dtype : str
+        a type string (e.g., "int8")
+
+    Returns
+    -------
+    type_info.min : int
+        the minimum of the range
+    type_info.max : int
+        the maximum of the range
+    """
+
+    try:
+        type_info = np.iinfo(dtype)
+    except ValueError:
+        type_info = np.finfo(dtype)
+    return type_info.min, type_info.max
+
+
+def round_away_zero(f: Union[float, np.double, np.single, np.float32, np.float64]) -> np.float64:
+    """Round the number away from zero towards +inf / -inf"""
+    offset = -0.5 if (f < 0) else 0.5
+    return np.trunc(f + offset)
+
+
+def round_up(a: int, b: int) -> int:
+    """Round up to a multiple of b"""
+    return ((a + b - 1) // b) * b
+
+
+def get_accelerator_config():
+    """Get the variant of the accelerator to compile for"""
+    compiler_attrs = tvm.get_global_func("relay.ext.ethosu.get_compiler_attrs")()
+    return compiler_attrs.accelerator_config
+
+
+# pylint: disable=unused-argument
+def partition_for_ethosu(
+    mod: tvm.ir.IRModule, params: Optional[Dict[str, tvm.runtime.NDArray]] = None, **opts
+):
+    """This helper function partitions the Relay graph, as produced by the
+    Relay frontend for a given model, into external functions
+    to be presented to the codegen.
+
+    Parameters
+    ----------
+    mod : tvm.ir.IRModule
+        The IRModule that gets generated from a Relay frontend
+    params : Optional[Dict[str, tvm.runtime.NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    mod : IRModule
+        The partitioned IRModule with external global functions
+    """
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    pattern = relay.op.contrib.get_pattern_table("ethosu")
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.MergeComposite(pattern)(mod)
+    mod = relay.transform.AnnotateTarget("ethosu")(mod)
+    mod = relay.transform.MergeCompilerRegions()(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.PartitionGraph()(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = preprocess.preprocess_ext_io()(mod)
+    return mod
+
+
+def get_arg_count(func):
+    """Helper function to get the number of
+    arguments in a python function"""
+    sig = signature(func)
+    return len(sig.parameters)
+
+
+def get_dim_value(layout: str, dim: str):
+    """This is a helper function to retrieve the index
+    of a given dimension character within the layout string
+    """
+    assert isinstance(layout, str)
+    assert dim in list(layout)
+    for idx, dim_char in enumerate(layout):
+        if dim_char == dim:
+            return idx
+    return None
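For orientation, a usage sketch of partition_for_ethosu above (illustrative only, not part of the patch; `tflite_model` is a placeholder for a real frontend input):

```python
from tvm import relay
from tvm.relay.backend.contrib.ethosu.util import partition_for_ethosu

# Any Relay module/params pair from a frontend would do; a quantized
# TFLite model is the typical source for the Ethos-U flow.
mod, params = relay.frontend.from_tflite(tflite_model)  # tflite_model: placeholder
mod = partition_for_ethosu(mod, params)
# Offloadable regions are now external functions annotated for the "ethosu" codegen.
```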
+""" +This is an adapter module for conversions between TVM and Vela. +The following conversion APIs are added : + *Obtaining the best block config + *Compressing weights + *Packing biases +""" +import logging +import math +from typing import Tuple, Optional, List +import numpy as np # type: ignore +from ethosu.vela import api as vapi # type: ignore + +from tvm.relay.backend.contrib.ethosu import util # type: ignore +from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs + +# pylint: disable=invalid-name +logger = logging.getLogger("Ethos-U") + +VELA_TO_NP_DTYPES = { + vapi.NpuDataType.UINT8: np.uint8, + vapi.NpuDataType.UINT16: np.uint16, + vapi.NpuDataType.INT8: np.int8, + vapi.NpuDataType.INT16: np.int16, + vapi.NpuDataType.INT32: np.int32, +} + +SCALE_BIAS_LENGTH = 10 + + +def get_optimal_block_config( + npu_op: vapi.NpuOperation, accel_type: vapi.NpuAccelerator +) -> vapi.NpuShape3D: + """ + "The NPU's unit of work is known as a block. It will fetch block(s) from Input + Feature Map (IFM) and a compute block for Output Feature Map (OFM). + Therefore, we need to pick an optimal block configuration considering bandwidth + to bring IFM blocks and the number of OFM block computes need to happen + to cover the OFM as indicated by the npu op. + + Parameters + ---------- + npu_op : ethosu.vela.api.NpuOperation + The NPU operation and its params + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + + Returns + ------- + ethosu.vela.api.NpuShape3D : + The optimal block config for the operator + """ + all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_type) + return _get_optimal_block_config(all_valid_block_configs) + + +def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> vapi.NpuShape3D: + """An internal function to get block config with largest depth + and then highest volume/area""" + assert isinstance(all_valid_block_configs, list) + for block_cfg in all_valid_block_configs: + assert isinstance(block_cfg, vapi.NpuShape3D) + + # Getting the largest volume block for benchmarking + all_valid_block_configs.sort( + key=lambda _cfg: _cfg.depth * _cfg.height * _cfg.width, reverse=True + ) + largest_volume_block_config = all_valid_block_configs[0] + largest_volume = ( + largest_volume_block_config.depth + * largest_volume_block_config.height + * largest_volume_block_config.width + ) + + all_valid_block_configs.sort(key=lambda _cfg: _cfg.depth, reverse=True) + max_d = all_valid_block_configs[0].depth + max_depth_block_configs = [_cfg for _cfg in all_valid_block_configs if _cfg.depth == max_d] + max_depth_block_configs.sort(key=lambda _cfg: _cfg.height * _cfg.width, reverse=True) + max_area = max_depth_block_configs[0].height * max_depth_block_configs[0].width + max_area_depth_block_configs = [ + _cfg for _cfg in max_depth_block_configs if _cfg.height * _cfg.width == max_area + ] + # This to get a deterministic anwser everytime + max_area_depth_block_configs.sort(key=lambda _cfg: _cfg.height, reverse=True) + assert len(max_area_depth_block_configs) > 0 + current_volume = ( + max_area_depth_block_configs[0].depth + * max_area_depth_block_configs[0].height + * max_area_depth_block_configs[0].width + ) + logger.info("Using block config=%s", max_area_depth_block_configs[0]) + logger.info( + "Quality of the block config w.r.t. 
max volume block config=%s", + 100.0 * (current_volume / largest_volume), + ) + return max_area_depth_block_configs[0] + + +def encode_weights(tir_extern_call, values, accel_type): + """This is an API function to compress weights by passing + a tir_extern_call to NPU Convolution operation and values. + + Parameters + ---------- + tir_extern_call : tvm.tir.Call + tir_extern_call to NPU Convolution operation + values : numpy.ndarray + The constant flattened weight data in OHWI layout + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + + Returns + ------- + bytearray + Compressed weights + """ + supported_ops = ["ethosu_conv2d"] + op = str(tir_extern_call.args[0].value) + assert op in supported_ops + npu_op, weights_zero_point = tirtocs.translate_ethosu_conv2d(tir_extern_call) + block_config = get_optimal_block_config(npu_op, accel_type) + # The weight layout is assumed to be flat OHWI, always. + assert len(values.shape) == 1 + shape_ohwi = ( + npu_op.ofm.shape.depth, + npu_op.kernel.height, + npu_op.kernel.width, + npu_op.ifm.shape.depth, + ) + assert values.size == np.prod(shape_ohwi) + values = np.reshape(values, shape_ohwi) + return compress_weights( + weights=values, + weights_zp=weights_zero_point, + # The weight layout is assumed to be OHWI, always. + weights_layout="OHWI", + ifm_bitdepth=npu_op.ifm.data_type.size_in_bits(), + block_depth=block_config.depth, + dilation=(npu_op.kernel.dilation_x, npu_op.kernel.dilation_y), + accel_type=accel_type, + # TODO(@manupa-arm): change this when we support depthwise + is_depthwise=False, + ) + + +def compress_weights( + weights: np.ndarray, + weights_zp: int, + weights_layout: str, + ifm_bitdepth: int, + block_depth: int, + dilation: Tuple[int, int], + accel_type: vapi.NpuAccelerator, + is_depthwise: Optional[bool] = False, +) -> bytearray: + """The NPU requires the weights to be compressed + to be executed. Therefore, this function calls into + the Vela APIs to compress the weights. 
+
+    Parameters
+    ----------
+    weights : numpy.ndarray
+        The raw weights
+    weights_zp : int
+        The zero point of the weights
+    weights_layout : str
+        A string literal indicating the layout.
+        Supported values : HWIO, HWOI, OHWI
+    ifm_bitdepth : int
+        The bit depth of the ifm the weights are used with
+    block_depth : int
+        The depth of the optimal block config for the operator
+    dilation : tuple
+        A tuple of 2 elements indicating dilation in h and w
+    accel_type : ethosu.vela.api.NpuAccelerator
+        The NPU accelerator variant
+    is_depthwise : bool, Optional
+        This indicates whether the weights are compressed for depthwise convolution
+
+    Returns
+    -------
+    compressed_weights : bytearray
+        Compressed weights
+    """
+    layout_transform_indices = {"HWIO": (3, 0, 1, 2), "HWOI": (2, 0, 1, 3), "OHWI": (0, 1, 2, 3)}
+    assert weights_layout in layout_transform_indices.keys()
+    assert isinstance(weights_zp, np.int64)
+    weights = weights.astype(np.int64) - weights_zp
+    # Vela needs the weights in OHWI layout
+    weights_ohwi = np.transpose(weights, layout_transform_indices[weights_layout])
+    shape_ohwi = [
+        weights.shape[layout_transform_indices[weights_layout][0]],
+        weights.shape[layout_transform_indices[weights_layout][1]],
+        weights.shape[layout_transform_indices[weights_layout][2]],
+        weights.shape[layout_transform_indices[weights_layout][3]],
+    ]
+    block_traversal = calculate_block_traversal_mode(is_depthwise, shape_ohwi, ifm_bitdepth)
+    compressed_weights = vapi.npu_encode_weights(
+        accelerator=accel_type,
+        weights_volume=weights_ohwi,
+        dilation_xy=dilation,
+        ifm_bitdepth=ifm_bitdepth,
+        ofm_block_depth=block_depth,
+        is_depthwise=is_depthwise,
+        block_traversal=block_traversal,
+    )
+    return compressed_weights
+
+
+def calculate_block_traversal_mode(
+    is_depthwise: bool, weights_shape_ohwi: List[int], ifm_bitdepth: int
+) -> vapi.NpuBlockTraversal:
+    """Calculate the block traversal mode, given whether the op is a depthwise
+    convolution, the shape of the weights and the bit-depth of the ifm.
+    """
+
+    if is_depthwise:
+        return vapi.NpuBlockTraversal.DEPTH_FIRST
+    # Determine which block traversal strategy has better DPU utilization
+    kernel_size = weights_shape_ohwi[1] * weights_shape_ohwi[2]
+    depth_utilization = weights_shape_ohwi[3] / util.round_up(
+        weights_shape_ohwi[3], 32 if ifm_bitdepth == 8 else 16
+    )
+    part_kernel_utilization = (weights_shape_ohwi[3] / util.round_up(weights_shape_ohwi[3], 8)) * (
+        kernel_size / util.round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
+    )
+    if part_kernel_utilization >= depth_utilization or weights_shape_ohwi[3] <= 8:
+        # Part-kernel first is always better for ifm depths <= 8
+        return vapi.NpuBlockTraversal.PART_KERNEL_FIRST
+    return vapi.NpuBlockTraversal.DEPTH_FIRST
+
+
+def pack_biases(
+    biases: np.ndarray,
+    ifm_scale: float,
+    ifm_dtype: np.dtype,
+    weight_scales: np.ndarray,
+    ofm_scale: float,
+    is_activation_tanh_or_sigmoid: bool = False,
+) -> np.ndarray:
+    """
+    The NPU requires each bias value to be packed with its output scale
+    parameters in an 80-bit format (as returned by the npu_encode_bias API).
+    This function packs such values into a binary artifact that the NPU
+    will use during execution.
+
+    Parameters
+    ----------
+    biases : numpy.ndarray
+        The values of the biases
+    ifm_scale : float
+        The quantization scale parameter of the input feature map
+    ifm_dtype : numpy.dtype
+        The data type of the input feature map data.
+    weight_scales : numpy.ndarray
+        The quantization scale parameters of the weights.
+        This could be an array if per-channel quantization is present.
+    ofm_scale : float
+        The quantization scale parameter of the output feature map.
+    is_activation_tanh_or_sigmoid : bool
+        Indicates whether the fused activation function is tanh or sigmoid.
+
+    Returns
+    -------
+    scale_bias : numpy.ndarray
+        Packed scales/biases as the hardware requires them.
+    """
+    # The BYOC infra should not partition anything else.
+    supported_ifm_dtypes = (np.uint8, np.int8, np.int16)
+    assert ifm_dtype in supported_ifm_dtypes
+
+    if weight_scales.size == 1:
+        weight_scales = [weight_scales] * biases.size
+
+    hw_bias_scales = _calculate_hw_bias_scales(
+        ifm_scale, weight_scales, ofm_scale, ifm_dtype, is_activation_tanh_or_sigmoid
+    )
+    assert len(hw_bias_scales) == biases.size
+    biases = biases.astype("int64")
+    packed_biases = bytearray()
+    for idx, scale in enumerate(hw_bias_scales):
+        packed_biases.extend(vapi.npu_encode_bias(biases[idx], *scale))
+    scale_bias = np.frombuffer(packed_biases, dtype=np.uint8)
+    scale_bias = np.reshape(scale_bias, (-1, 10))
+    return scale_bias
+
+
+def _quantize_scale(scale: float) -> Tuple[int, int]:
+    """Quantize a floating point scale into a 32-bit int scale with a 6-bit shift.
+    This is to be used with 8-bit data.
+    """
+    mantissa, exponent = math.frexp(scale)
+    mantissa_scaled = mantissa * (1 << 31)
+    mantissa_scaled = int(util.round_away_zero(mantissa_scaled))
+    required_shift = 31 - exponent
+    assert 0 <= required_shift < (1 << 6)
+    return mantissa_scaled, required_shift
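The fixed-point encoding above is easiest to see with a worked example (an illustration, not part of the patch): _quantize_scale represents scale ≈ mantissa_scaled * 2**-shift, where mantissa_scaled is a 32-bit value and the shift fits in 6 bits.

```python
import math

scale = 0.00784313  # a typical combined (ifm_scale * weight_scale / ofm_scale) value
mantissa, exponent = math.frexp(scale)         # scale == mantissa * 2**exponent, 0.5 <= mantissa < 1
mantissa_scaled = round(mantissa * (1 << 31))  # 31 fractional bits (round-away-from-zero in the real helper)
shift = 31 - exponent                          # here: 31 - (-6) == 37, well within 6 bits
assert abs(mantissa_scaled * 2.0 ** -shift - scale) < 1e-9
```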
+def _reduced_quantize_scale(scale: float) -> Tuple[int, int]:
+    """A reduction of precision is required for 16-bit data."""
+    mantissa_scaled, required_shift = _quantize_scale(scale)
+    # This is the maximum a signed 16-bit number can represent
+    max_reduced_mantissa_scaled = (1 << 15) - 1
+    # If the current value is larger than the pre-scaled max_reduced_mantissa_scaled,
+    # we need to saturate the answer to max_reduced_mantissa_scaled
+    if mantissa_scaled >= max_reduced_mantissa_scaled << 16:
+        reduced_mantissa_scaled = max_reduced_mantissa_scaled
+    else:
+        reduced_mantissa_scaled = (mantissa_scaled + (1 << 15)) >> 16
+    reduced_shift = required_shift - 16
+    return reduced_mantissa_scaled, reduced_shift
+
+
+def _calculate_hw_bias_scales(
+    ifm_scale: float,
+    weight_scales: List[float],
+    ofm_scale: float,
+    ifm_dtype: np.dtype,
+    is_faf_tanh_sigmoid: bool = False,
+) -> List[Tuple[int, int]]:
+    """This function produces a scale that is calculated using the scales of the
+    ifm, weights and ofm. Note that if per-channel / per-value quantization
+    is required, it is reflected here in the hw bias scales"""
+    if is_faf_tanh_sigmoid:
+        ifm_scale = ifm_scale * 0x3000
+    if ifm_dtype == np.uint8:
+        bias_scales = [np.double(ifm_scale * ws) / np.double(ofm_scale) for ws in weight_scales]
+    else:
+        assert ifm_dtype in (np.int8, np.int16)
+        ifm_scale_dbl = np.double(ifm_scale)
+        ofm_scale_dbl = np.double(ofm_scale)
+        bias_scales = [ifm_scale_dbl * np.double(ws) / ofm_scale_dbl for ws in weight_scales]
+
+    if ifm_dtype == np.int16:
+        hw_bias_scales = [_reduced_quantize_scale(bs) for bs in bias_scales]
+    else:
+        assert ifm_dtype in (np.uint8, np.int8)
+        hw_bias_scales = [_quantize_scale(bs) for bs in bias_scales]
+
+    return hw_bias_scales
+
+
+def get_target_accel_type():
+    """This is a helper function to convert the CLI accelerator type string
+    argument to an NpuAccelerator"""
+    npu_accel_str_map = {
+        "ethos-u55-256": vapi.NpuAccelerator.Ethos_U55_256,
+        "ethos-u55-128": vapi.NpuAccelerator.Ethos_U55_128,
+        "ethos-u55-64": vapi.NpuAccelerator.Ethos_U55_64,
+        "ethos-u55-32": vapi.NpuAccelerator.Ethos_U55_32,
+    }
+    accel_type_str = util.get_accelerator_config()
+    assert accel_type_str in npu_accel_str_map.keys(), f"{accel_type_str} is not supported"
+    return npu_accel_str_map[accel_type_str]
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index ce048105ae8b..3a4897ad3166 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -726,12 +726,13 @@ def gru_cell(
         b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
         b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
         r_gate += b_ir + b_hr
+        r_gate = rz_act(r_gate)
         z_gate += b_iz + b_hz
         i_n += b_in
         h_n = _op.nn.dense((r_gate * hidden_state), w_hn) + b_hn
     else:
+        r_gate = rz_act(r_gate)
         h_n = _op.nn.dense((r_gate * hidden_state), w_hn)
-    r_gate = rz_act(r_gate)
     z_gate = rz_act(z_gate)
     n_gate = n_act(i_n + h_n)
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index ede859245dbb..66a62b9c4034 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -19,6 +19,7 @@
 """ONNX: Open Neural Network Exchange frontend for Relay."""
 import copy
 import warnings
+from typing import Optional
 import numpy as np
 import tvm
@@ -36,21 +37,9 @@
 from .. import random as _random
 from .. import ty as _ty
 from .. import vision as _vision
-from .common import (
-    AttrCvt,
-    Renamer,
-    fold_constant,
-    get_name,
-    get_relay_op,
-    gru_cell,
-    infer_channels,
-    infer_shape,
-    infer_type,
-    infer_value,
-    lstm_cell,
-    new_var,
-    unbind,
-)
+from .common import (AttrCvt, Renamer, fold_constant, get_name, get_relay_op,
+                     gru_cell, infer_channels, infer_shape, infer_type,
+                     infer_value, lstm_cell, new_var, unbind)
 __all__ = ["from_onnx"]
@@ -69,7 +58,11 @@ class onnx_input(list):
     def __getitem__(self, item):
         if isinstance(item, slice):
-            indices = list(range(item.stop)[item])
+            if item.stop is None:
+                stop = len(self)
+            else:
+                stop = item.stop
+            indices = list(range(stop)[item])
             return [self[i] for i in indices]
         if isinstance(item, int):
             return list(self)[item] if item < len(self) else None
@@ -196,6 +189,17 @@ def _dim_check(attrs):
     return _dim_check, "Only 1d, 2d and 3d kernel supported."
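As a quick illustration of the onnx_input.__getitem__ fix above (a sketch, not part of the patch; assumes the class is importable from tvm.relay.frontend.onnx):

```python
from tvm.relay.frontend.onnx import onnx_input

inputs = onnx_input(["a", "b", "c"])
# Open-ended slices previously crashed because range(item.stop) was
# evaluated with item.stop == None; a missing stop now defaults to len(self).
assert inputs[1:] == ["b", "c"]
# Out-of-range integer indices still yield None rather than raising.
assert inputs[5] is None
```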
+def get_scalar(x, params, dtype="float32"): + """Helper to get a scalar value for Quantized operators.""" + if isinstance(x, _expr.Var) and x.name_hint in params: + return _op.const(params[x.name_hint].numpy(), dtype) + rank = len(infer_shape(x)) + assert rank <= 1, "scale and zero_point input must be scalars" + if rank == 1: + x = _op.squeeze(x, [0]) + return _op.cast(x, dtype) + + class OnnxOpConverter(object): """A helper class for holding onnx op converters.""" @@ -479,7 +483,7 @@ def _impl_v1(cls, inputs, attr, params): attr["dilations"] = [1] + list(attr["dilations"]) if "pads" in attr: attr["pads"] = [0, attr["pads"][0], 0, attr["pads"][1]] - + attr["channels"] = kernel_shapes[0][0] out = AttrCvt( op_name=dimension_picker("conv"), transforms={ @@ -1010,6 +1014,32 @@ def _impl_v1(cls, inputs, attr, params): return _op.log(_op.exp(beta * inputs[0]) + _expr.const(1.0)) * alpha +class Pow(OnnxOpConverter): + """Operator converter for Pow.""" + + @classmethod + def _impl_v13(cls, inputs, attr, params): + x = inputs[0] + y = inputs[1] + + x_type = infer_type(x).checked_type.dtype + output_type = x_type + y_type = infer_type(y).checked_type.dtype + + if not x_type.startswith("float"): + x_type = "float32" + x = _op.cast(x, x_type) + + if x_type != y_type: + y = _op.cast(y, x_type) + + # TODO: come up with good default integer pow() func for common backends + result = _op.power(x, y) + if x_type != output_type: + return _op.cast(result, output_type) + return result + + class Prelu(OnnxOpConverter): """Operator converter for Prelu.""" @@ -1382,20 +1412,13 @@ class Slice(OnnxOpConverter): @classmethod def _common(cls, starts, ends, axes): - new_axes = [] - new_starts = [] - new_ends = [] - pop_index = 0 - for i in range(max(axes) + 1): - if i in axes: - new_axes.append(i) - new_starts.append(starts[pop_index]) - new_ends.append(ends[pop_index]) - pop_index += 1 - else: - new_axes.append(i) - new_starts.append(0) - new_ends.append(np.iinfo(np.int32).max) + N = max(axes) + 1 + new_axes = list(range(N)) + new_starts = [0] * N + new_ends = [np.iinfo(np.int32).max] * N + for i, axis in enumerate(axes): + new_starts[axis] = starts[i] + new_ends[axis] = ends[i] return new_starts, new_ends, new_axes @classmethod @@ -1408,13 +1431,10 @@ def _impl_v1(cls, inputs, attr, params): # Update the starts and ends according to axes if required. 
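+            # e.g. axes=(1,), starts=(2,), ends=(5,) is padded by cls._common to
+            # axes=[0, 1], starts=[0, 2], ends=[2147483647, 5], i.e. axes not
+            # mentioned get a full-range slice.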
if isinstance(attr["axes"], int): attr["axes"] = (attr["axes"],) - if (max(attr["axes"]) + 1) != len(attr["axes"]): - new_starts, new_ends, new_axes = cls._common( - attr["starts"], attr["ends"], attr["axes"] - ) - attr["axes"] = new_axes - attr["starts"] = new_starts - attr["ends"] = new_ends + new_starts, new_ends, new_axes = cls._common(attr["starts"], attr["ends"], attr["axes"]) + attr["axes"] = new_axes + attr["starts"] = new_starts + attr["ends"] = new_ends except KeyError: pass begin = list(attr["starts"]) @@ -1530,7 +1550,12 @@ def _impl_common(cls, data, indices, batch_dims=0): indices_shape = infer_shape(indices) indices = _op.transpose(indices, axes=[-1] + list(range(indices_dims - 1))) index_rank = indices_shape[-1] - return _op.gather_nd(data, indices, batch_dims, index_rank) + return _op.gather_nd( + data, + indices, + batch_dims=batch_dims, + index_rank=index_rank, + ) @classmethod def _impl_v1(cls, inputs, attr, params): @@ -1584,22 +1609,6 @@ def _impl_v9(cls, inputs, attr, params): return _op.scatter_nd(zeros, _op.stack([indices, indices + k], axis=0), ones, "update") -class Greater(OnnxOpConverter): - """Operator logical greater.""" - - @classmethod - def _impl_v7(cls, inputs, attr, params): - return _op.greater(inputs[0], inputs[1]) - - -class Less(OnnxOpConverter): - """Operator logical less than.""" - - @classmethod - def _impl_v7(cls, inputs, attr, params): - return _op.less(inputs[0], inputs[1]) - - class LRN(OnnxOpConverter): """Operator converter for Local Response Normalization.""" @@ -1899,6 +1908,23 @@ def _impl_v1(cls, inputs, attr, params): ) return _op.reshape(onehot, shape_of(inputs[0])) + @classmethod + def _impl_v13(cls, inputs, attr, params) -> relay.Expr: + inferred_type = infer_type(inputs[0]) + dtype = inferred_type.checked_type.dtype + ndim = len(inferred_type.checked_type.shape) + axis = attr.get("axis", -1) % ndim + + argmax = _op.argmax(inputs[0], axis=axis) + return _op.one_hot( + argmax, + _op.const(1.0, dtype), + _op.const(0.0, dtype), + fold_constant(_op.take(shape_of(inputs[0]), _op.const([axis], "int64"))), + axis, + dtype, + ) + class OneHot(OnnxOpConverter): """Operator converter for OneHot.""" @@ -3021,6 +3047,7 @@ def _op_dispatch(cls, operator, inputs, attr, params): op_map = { "size": cls._size, "arange": cls._arange, + "index_put": cls._index_put, "reshape": cls._reshape, "embedding_bag": cls._embedding_bag, } @@ -3040,6 +3067,47 @@ def _size(cls, inputs, attr, params): def _arange(cls, inputs, attr, params): return _op.arange(inputs[0], inputs[1], inputs[2], dtype="int64") + @classmethod + def _check_index(cls, indices, values): + def unfolding_indices(indices, values): + n = len(indices) + flatten_indices = [] + slices_size = [] + for index in indices: + flatten_indices.append(_op.reshape(index, _op.const([-1]))) + slices_size.append(infer_shape(flatten_indices[-1])[0]) + repeat_size = [1] + tile_size = [1] + for i in range(1, n): + repeat_size.append(slices_size[-i] * repeat_size[-1]) + tile_size.append(slices_size[i - 1] * tile_size[-1]) + repeat_size.reverse() + unflod_slices = [] + for i in range(n): + unflod_slices.append( + fold_constant( + _op.repeat(_op.tile(flatten_indices[i], (tile_size[i],)), repeat_size[i], 0) + ) + ) + return unflod_slices, _op.reshape(values, _op.const([-1])) + + values_shape = infer_shape(values) + if len(values_shape) != 1: + return unfolding_indices(indices, values) + return indices, values + + @classmethod + def _index_put(cls, inputs, attr, params): + in_tensor = inputs[0] + indices, values = 
cls._check_index(inputs[1 : len(inputs) - 2], inputs[len(inputs) - 2]) + accumulate = inputs[len(inputs) - 1].data.asnumpy() != 0 + if not accumulate: + mode = "update" + else: + mode = "add" + index_tensor = _op.stack(indices, axis=0) + return _op.transform.scatter_nd(in_tensor, index_tensor, values, mode) + @classmethod def _reshape(cls, inputs, attr, params): return _op.reshape(inputs[0], inputs[1]) @@ -3137,23 +3205,14 @@ class QLinearConv(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - def get_scalar(x, dtype="float32"): - if isinstance(x, _expr.Var) and x.name_hint in params: - return _op.const(params[x.name_hint].numpy(), dtype) - rank = len(infer_shape(x)) - assert rank <= 1, "QLinearConv scale and zero_point input must be scalars" - if rank == 1: - x = _op.squeeze(x, [0]) - return _op.cast(x, dtype) - data = inputs[0] - x_scale = get_scalar(inputs[1]) - x_zero_point = get_scalar(inputs[2], "int32") + x_scale = get_scalar(inputs[1], params) + x_zero_point = get_scalar(inputs[2], params, "int32") weight = inputs[3] - w_scale = get_scalar(inputs[4]) - w_zero_point = get_scalar(inputs[5], "int32") - y_scale = fold_constant(get_scalar(inputs[6])) - y_zero_point = get_scalar(inputs[7], "int32") + w_scale = get_scalar(inputs[4], params) + w_zero_point = get_scalar(inputs[5], params, "int32") + y_scale = fold_constant(get_scalar(inputs[6], params)) + y_zero_point = get_scalar(inputs[7], params, "int32") input_shape = infer_shape(data) @@ -3241,23 +3300,14 @@ class QLinearAdd(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - def get_scalar(x, dtype="float32"): - if isinstance(x, _expr.Var) and x.name_hint in params: - return _op.const(params[x.name_hint].numpy(), dtype) - rank = len(infer_shape(x)) - assert rank <= 1, "QLinearConv scale and zero_point input must be scalars" - if rank == 1: - x = _op.squeeze(x, [0]) - return _op.cast(x, dtype) - a = inputs[0] - a_scale = get_scalar(inputs[1]) - a_zero_point = get_scalar(inputs[2], "int32") + a_scale = get_scalar(inputs[1], params) + a_zero_point = get_scalar(inputs[2], params, "int32") b = inputs[3] - b_scale = get_scalar(inputs[4]) - b_zero_point = get_scalar(inputs[5], "int32") - c_scale = get_scalar(inputs[6]) - c_zero_point = get_scalar(inputs[7], "int32") + b_scale = get_scalar(inputs[4], params) + b_zero_point = get_scalar(inputs[5], params, "int32") + c_scale = get_scalar(inputs[6], params) + c_zero_point = get_scalar(inputs[7], params, "int32") dtype = infer_type(a).checked_type.dtype @@ -3279,23 +3329,14 @@ class QLinearMul(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - def get_scalar(x, dtype="float32"): - if isinstance(x, _expr.Var) and x.name_hint in params: - return _op.const(params[x.name_hint].numpy(), dtype) - rank = len(infer_shape(x)) - assert rank <= 1, "QLinearMul scale and zero_point input must be scalars" - if rank == 1: - x = _op.squeeze(x, [0]) - return _op.cast(x, dtype) - a = inputs[0] - a_scale = get_scalar(inputs[1]) - a_zero_point = get_scalar(inputs[2], "int32") + a_scale = get_scalar(inputs[1], params) + a_zero_point = get_scalar(inputs[2], params, "int32") b = inputs[3] - b_scale = get_scalar(inputs[4]) - b_zero_point = get_scalar(inputs[5], "int32") - y_scale = fold_constant(get_scalar(inputs[6])) - y_zero_point = get_scalar(inputs[7], "int32") + b_scale = get_scalar(inputs[4], params) + b_zero_point = get_scalar(inputs[5], params, "int32") + y_scale = fold_constant(get_scalar(inputs[6], params)) + y_zero_point = 
get_scalar(inputs[7], params, "int32") dtype = infer_type(a).checked_type.dtype @@ -3308,6 +3349,32 @@ def get_scalar(x, dtype="float32"): return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype) +class QLinearConcat(OnnxOpConverter): + """Operator converter for QLinearConcat from Microsoft onnxruntime contrib opset.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # which axis to concat on + axis = attr["axis"] + + y_scale = fold_constant(get_scalar(inputs[0], params)) + y_zero_point = get_scalar(inputs[1], params, "int32") + + # input tensors, scales, zero_points + assert ( + len(inputs) % 3 == 2 + ), "Additional input count must be a multiple of 3 -- tensor/scale/zero_point tuples" + tensors = [] + scales = [] + zero_points = [] + for i in range(2, len(inputs), 3): + tensors.append(inputs[i]) + scales.append(get_scalar(inputs[i + 1], params)) + zero_points.append(get_scalar(inputs[i + 2], params, "int32")) + + return _qnn.op.concatenate(tensors, scales, zero_points, y_scale, y_zero_point, axis) + + class ConvInteger(OnnxOpConverter): """Operator converter for ConvInteger.""" @@ -3432,6 +3499,15 @@ def _impl_v11(cls, inputs, attr, params): return _expr.TupleWrapper(_expr.Tuple([unique_vals, indices, inverse_indices, counts]), 4) +class Einsum(OnnxOpConverter): + """Operator converter for Einsum""" + + @classmethod + def _impl_v12(cls, inputs, attr, params): + equation = attr["equation"].decode("utf-8") + return _op.einsum(inputs, equation) + + class RandomUniform(OnnxOpConverter): """Operator converter for random_uniform""" @@ -3457,15 +3533,24 @@ def _impl_v1(cls, inputs, attr, params): class NegativeLogLikelihoodLoss(OnnxOpConverter): - """Operator converter for random_uniform""" + """Operator converter for NegativeLogLikehoodLoss""" VALID_REDUCTIONS = {"mean", "sum", "none"} @classmethod - def run_calculation( - cls, input_tensor, target_tensor, weight_tensor=None, ignore_index=None, reduction="none" + def _run_calculation( + input_tensor: relay.Expr, + target_tensor: relay.Expr, + weight_tensor: Optional[relay.Expr], + ignore_index: int, ): - """Run main calculation for onnx spec of NegativeLogLikehoodLoss""" + """Run calculation for NegativeLogLikelihood, returning output tensor and + weight tensor used for mean-style reductions. + """ + # Convert negative indices --> positive indices for gather ops, note we have to + # use the original target tensor to interact with ignore_index to have proper behavior. 
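+        # (e.g. with C classes along the class axis, a target index of -1 maps to C - 1)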
+ normalized_target_tensor = normalize_gather_indices(input_tensor, target_tensor, 1) + if weight_tensor is None: channels = infer_shape(input_tensor)[1] weight_tensor = relay.ones( @@ -3473,12 +3558,18 @@ def run_calculation( dtype=infer_type(input_tensor).checked_type.dtype, ) - loss = -relay.gather(input_tensor, axis=1, indices=relay.expand_dims(target_tensor, 1)) + loss = -relay.gather( + input_tensor, + axis=1, + indices=relay.expand_dims(normalized_target_tensor, 1), + ) loss = relay.squeeze(loss, axis=[1]) - expanded_target_tensor = relay.expand_dims(target_tensor, 0) - expanded_target_tensor = relay.nn.batch_flatten(expanded_target_tensor) - flattened_weights = relay.gather_nd(weight_tensor, expanded_target_tensor) + expanded_normalized_target_tensor = relay.expand_dims(normalized_target_tensor, 0) + expanded_normalized_target_tensor = relay.nn.batch_flatten( + expanded_normalized_target_tensor + ) + flattened_weights = relay.gather_nd(weight_tensor, expanded_normalized_target_tensor) select_weights = relay.reshape_like(flattened_weights, loss) loss *= select_weights @@ -3488,7 +3579,9 @@ def run_calculation( target_tensor, relay.const(ignore_index, dtype=target_tensor.type_annotation.dtype) ) mask_tensor = relay.const(1, dtype="int8") - relay.cast(mask_tensor, "int8") - loss *= relay.cast_like(mask_tensor, loss) + loss = relay.where( + mask_tensor, loss, relay.const(0, infer_type(loss).checked_type.dtype) + ) # This is not explained super clearly in the onnx spec, but masked values don't # contribute toward the final value in reduction @@ -3562,6 +3655,176 @@ def _impl_v13(cls, inputs, attr, params): return loss +class Adagrad(OnnxOpConverter): + """Operator converter for adagrad op.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + decay_factor = attr.get("decay_factor", 0.0) + epsilon = attr.get("epsilon", 0.0) + norm_coefficient = attr.get("norm_coefficient", 0.0) + + R = inputs[0] + T = inputs[1] + + # convert attributes to constants, proper types + dtype_inputs = infer_type(inputs[3]).checked_type.dtype + decay_factor = relay.const(decay_factor, dtype=dtype_inputs) + epsilon = relay.const(epsilon, dtype=dtype_inputs) + norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs) + T = relay.cast_like(T, inputs[3]) + + assert ( + len(inputs) - 2 + ) % 3 == 0, f"Expect triplets for remaining inputs, found {len(inputs) - 2}" + + # Remaining inputs are: + # [x_1, x_2 ..., x_1_gradient, x_2_gradient, ... x_1_sq_g, x_2_sq_g...] 
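+        # For instance, with two parameter tensors the flat input layout is
+        #   [R, T, x_1, x_2, x_1_gradient, x_2_gradient, x_1_sq_g, x_2_sq_g],
+        # so each group below is read at an offset of num_input_tensors.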
+ num_input_tensors = (len(inputs) - 2) // 3 + output_tensors = [] + output_accumulated_squared_gradients = [] + for i in range(num_input_tensors): + x = inputs[i + 2] + gradient = inputs[i + 2 + num_input_tensors] + accumulated_squared_gradient = inputs[i + 2 + 2 * num_input_tensors] + + r = R / (relay.const(1.0, dtype=dtype_inputs) + T * decay_factor) + g_regularized = norm_coefficient * x + gradient + new_accumulated_squared_gradient = ( + accumulated_squared_gradient + g_regularized * g_regularized + ) + h_adaptive = relay.sqrt(new_accumulated_squared_gradient) + epsilon + + x_new = x - r * g_regularized / h_adaptive + + output_tensors.append(x_new) + output_accumulated_squared_gradients.append(new_accumulated_squared_gradient) + + # append lists together, momentums come after result tensors + result = output_tensors + output_accumulated_squared_gradients + return _expr.TupleWrapper(_expr.Tuple(result), len(result)) + + +class Adam(OnnxOpConverter): + """Operator converter for Adam op.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = attr.get("alpha", 0.9) + beta = attr.get("beta", 0.999) + + # Note in the docs epsilon default is 0.0 but in the tests it is set to 1e-2: + # https://git.io/Ju5C4 + epsilon = attr.get("epsilon", 1e-2) + norm_coefficient = attr.get("norm_coefficient", 0.0) + norm_coefficient_post = attr.get("norm_coefficient_post", 0.0) + + R = inputs[0] + T = inputs[1] + + assert ( + len(inputs) - 2 + ) % 4 == 0, f"Expect 4-lets for remaining inputs, found {len(inputs) - 2}" + + # convert attributes to constants, proper types + dtype_inputs = infer_type(inputs[3]).checked_type.dtype + inverse_alpha = relay.const(1 - alpha, dtype=dtype_inputs) + alpha = relay.const(alpha, dtype=dtype_inputs) + inverse_beta = relay.const(1 - beta, dtype=dtype_inputs) + beta = relay.const(beta, dtype=dtype_inputs) + epsilon = relay.const(epsilon, dtype=dtype_inputs) + norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs) + norm_coefficient_post = relay.const(norm_coefficient_post, dtype=dtype_inputs) + one = relay.const(1, dtype=dtype_inputs) + T = relay.cast_like(T, inputs[3]) + + # Remaining inputs are: + # [x_1, x_2 ..., x_1_grad, x_2_grad, ... x_1_g_accum, x_2_g_accum..., x_1_g_sq_accum, ...] 
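+        # For instance, with a single parameter tensor the flat input layout is
+        #   [R, T, x_1, x_1_grad, x_1_g_accum, x_1_g_sq_accum].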
+ num_input_tensors = (len(inputs) - 2) // 4 + output_tensors = [] + output_accumulated_gradients = [] + output_accumulated_squared_gradients = [] + for i in range(num_input_tensors): + x = inputs[i + 2] + g = inputs[i + 2 + num_input_tensors] + v = inputs[i + 2 + 2 * num_input_tensors] + h = inputs[i + 2 + 3 * num_input_tensors] + + g_regularized = norm_coefficient * x + g + v_new = alpha * v + inverse_alpha * g_regularized + h_new = beta * h + inverse_beta * g_regularized * g_regularized + h_sqrt = relay.sqrt(h_new) + epsilon + + true_branch = R * relay.sqrt(one - relay.power(beta, T)) / (one - relay.power(alpha, T)) + R_adjusted = relay.If(T > relay.const(0, dtype=dtype_inputs), true_branch, R) + + x_new = x - R_adjusted * (v_new / h_sqrt) + x_result = (one - norm_coefficient_post) * x_new + + output_tensors.append(x_result) + output_accumulated_gradients.append(v_new) + output_accumulated_squared_gradients.append(h_new) + + # append lists together to get final result + result = ( + output_tensors + output_accumulated_gradients + output_accumulated_squared_gradients + ) + return _expr.TupleWrapper(_expr.Tuple(result), len(result)) + + +class Momentum(OnnxOpConverter): + """Operator converter for Momentum op.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = attr["alpha"] + beta = attr["beta"] + mode = attr["mode"].decode("utf-8") + norm_coefficient = attr["norm_coefficient"] + + assert mode in ["nesterov", "standard"], f"Unknown momentum mode {mode}" + R = inputs[0] + T = inputs[1] + + assert ( + len(inputs) - 2 + ) % 3 == 0, f"Expect triplets for remaining inputs, found {len(inputs) - 2}" + # Remaining inputs are: + # [x_1, x_2 ..., x_1_gradient, x_2_gradient, ... x_1_momentum, x_2_momentum...] + num_input_tensors = (len(inputs) - 2) // 3 + + # convert attributes to constants + dtype_inputs = infer_type(inputs[3]).checked_type.dtype + alpha = relay.const(alpha, dtype=dtype_inputs) + beta = relay.const(beta, dtype=dtype_inputs) + norm_coefficient = relay.const(norm_coefficient, dtype=dtype_inputs) + default_beta = relay.const(1.0, dtype=dtype_inputs) + + # Calculate updated values for every input + output_tensors = [] + output_momentums = [] + for i in range(num_input_tensors): + x = inputs[i + 2] + gradient = inputs[i + 2 + num_input_tensors] + momentum = inputs[i + 2 + 2 * num_input_tensors] + g_regularized = norm_coefficient * x + gradient + beta_adjusted = relay.If(T > relay.const(0, dtype="int64"), beta, default_beta) + new_momentum = alpha * momentum + beta_adjusted * g_regularized + + if mode == "standard": + x_output = x - R * new_momentum + else: + # mode == 'nesterov' + x_output = x - R * (g_regularized + alpha * new_momentum) + + output_tensors.append(x_output) + output_momentums.append(new_momentum) + + # append lists together, momentums come after result tensors + result = output_tensors + output_momentums + return _expr.TupleWrapper(_expr.Tuple(result), len(result)) + + # compatible operators that do NOT require any conversion. 
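For reference (not part of the patch), the update rule the Momentum converter above encodes as Relay expressions, written as a minimal NumPy sketch; the helper name and sample values are illustrative only:

    import numpy as np

    def momentum_step(x, grad, m, lr, alpha, beta, norm_coefficient, t, nesterov=False):
        # g_regularized in the converter above
        g_reg = norm_coefficient * x + grad
        # ONNX applies beta only after the first update (t > 0), cf. beta_adjusted
        beta_adj = beta if t > 0 else 1.0
        m_new = alpha * m + beta_adj * g_reg
        # standard vs. nesterov variant, matching the two x_output branches
        x_new = x - lr * (g_reg + alpha * m_new) if nesterov else x - lr * m_new
        return x_new, m_new

    x, m = np.array([1.0, 2.0]), np.zeros(2)
    x, m = momentum_step(x, np.array([0.1, -0.2]), m, lr=0.01, alpha=0.9,
                         beta=0.999, norm_coefficient=0.0, t=0)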
_identity_list = [] @@ -3620,8 +3883,8 @@ def _get_convert_map(opset): "Selu": Selu.get_converter(opset), "Elu": Elu.get_converter(opset), "Exp": Renamer("exp"), - "Greater": Greater.get_converter(opset), - "Less": Less.get_converter(opset), + "Greater": Renamer("greater"), + "Less": Renamer("less"), "Log": Renamer("log"), "Acos": Renamer("acos"), "Acosh": Renamer("acosh"), @@ -3635,7 +3898,7 @@ def _get_convert_map(opset): "Sinh": Renamer("sinh"), "Tan": Renamer("tan"), "Tanh": Renamer("tanh"), - "Pow": Renamer("power"), + "Pow": Pow.get_converter(opset), "PRelu": Prelu.get_converter(opset), "Sigmoid": Renamer("sigmoid"), "HardSigmoid": HardSigmoid.get_converter(opset), @@ -3728,6 +3991,7 @@ def _get_convert_map(opset): "Range": Range.get_converter(opset), "CumSum": CumSum.get_converter(opset), "Unique": Unique.get_converter(opset), + "Einsum": Einsum.get_converter(opset), # defs/control_flow "Loop": Loop.get_converter(opset), "If": If.get_converter(opset), @@ -3739,14 +4003,18 @@ def _get_convert_map(opset): "DynamicQuantizeLinear": DynamicQuantizeLinear.get_converter(opset), "ReverseSequence": ReverseSequence.get_converter(opset), "QLinearConv": QLinearConv.get_converter(opset), + "QLinearConcat": QLinearConcat.get_converter(opset), "QLinearAdd": QLinearAdd.get_converter(opset), "QLinearMul": QLinearMul.get_converter(opset), "ConvInteger": ConvInteger.get_converter(opset), # Random number generation. "RandomUniform": RandomUniform.get_converter(opset), - # Loss functions + # Loss functions / training "NegativeLogLikelihoodLoss": NegativeLogLikelihoodLoss.get_converter(opset), "SoftmaxCrossEntropyLoss": SoftmaxCrossEntropyLoss.get_converter(opset), + "Adagrad": Adagrad.get_converter(opset), + "Adam": Adam.get_converter(opset), + "Momentum": Momentum.get_converter(opset), } diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c13d791cf2e2..39bcfc68e421 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -3400,8 +3400,8 @@ def _getattr_attr_name(node): return attr_name -def _getattr_full_name(getattrs): - return ".".join([_getattr_attr_name(node) for node in getattrs]) +def _getattr_full_name(getattrs, sep="."): + return sep.join([_getattr_attr_name(node) for node in getattrs]) def _get_pytorch_value_type(typ, default_dtype="float32"): @@ -3657,7 +3657,7 @@ def terminate(users): return get_use_chains(root_getattr_node, terminate) -def convert_params(graph, state_dict): +def convert_params(graph, state_dict, use_parser_friendly_name=False): """ Return Relay vars and TVM NDArrays for input parameters A chain of prim::GetAttr nodes is processed one at a time @@ -3668,6 +3668,7 @@ def convert_params(graph, state_dict): packed_param_map = {} vars_by_name = {} seen = set() + attr_name_sep = "_" if use_parser_friendly_name else "." 
     for node in getattr_nodes:
         if _get_output_name(node) in seen:
@@ -3676,7 +3677,7 @@
         for getattrs in get_attr_chains(node):
             seen.update(map(_get_output_name, getattrs))

-            full_attr = _getattr_full_name(getattrs)
+            full_attr = _getattr_full_name(getattrs, attr_name_sep)
             full_attr_node_name = _get_output_name(getattrs[-1])

             if full_attr.endswith("_packed_params"):  # for quantized models
@@ -3706,7 +3707,13 @@ def get_all_op_names(graph):
     return set(node.kind() for node in nodes)


-def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dtype="float32"):
+def from_pytorch(
+    script_module,
+    input_infos,
+    custom_convert_map=None,
+    default_dtype="float32",
+    use_parser_friendly_name=False,
+):
     """Load PyTorch model in the form of a scripted PyTorch model and convert into relay.
     The companion parameters will be handled automatically.

@@ -3729,6 +3736,15 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt
     custom_convert_map : Dictionary of str to Relay op
         A custom op conversion map in the same format as _convert_map above

+    default_dtype : str
+        The default dtype to use when type information is not provided by PyTorch.
+
+    use_parser_friendly_name : bool
+        When True, replace '.' with '_' in the original parameter names.
+        The Relay text parser treats a variable name followed by a period as a tuple element access,
+        so a variable name like "dense.weight" cannot be parsed correctly.
+        Use this option when you want to run the AnnotateSpans pass on the imported module.
+
     Returns
     -------
     mod : tvm.IRModule
@@ -3758,7 +3774,13 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt
     outputs = _get_relay_input_vars(
         graph, input_infos, prelude, default_dtype=default_dtype, is_module=is_module
     )
-    param_vars, tensors, packed_param_map = convert_params(graph, params)
+
+    if use_parser_friendly_name:
+        new_names = [key.replace(".", "_") for key in params.keys()]
+        params = dict(zip(new_names, params.values()))
+
+    param_vars, tensors, packed_param_map = convert_params(graph, params, use_parser_friendly_name)
+
     tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()}

     outputs.update(param_vars)
@@ -3778,7 +3800,7 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt
         # ListConstruct kept original python list. Convert to tuple.
         ret = _expr.Tuple(ret)

-    # Separate data inputs and parameters to make sure data inputs are always in the beginning.
+    # Separate data inputs and parameters to make sure data inputs come first.
     func_args = []
     data_inputs = []
     for arg in _analysis.free_vars(ret):
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 2e509a111c4a..825bd1f627ca 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -54,6 +54,7 @@
 from . import _transform
 from . import _reduce
 from . import _algorithm
+from . import _math


 def _register_op_make():
diff --git a/python/tvm/relay/op/_math.py b/python/tvm/relay/op/_math.py
new file mode 100644
index 000000000000..ff74fafcef75
--- /dev/null
+++ b/python/tvm/relay/op/_math.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Backend compiler related feature registration""" +from . import op as _reg +from . import strategy + +# einsum +_reg.register_strategy("einsum", strategy.einsum_strategy) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index bee188f19364..0284d2483ce5 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -182,6 +182,7 @@ def compute_unique(attrs, inputs, output_type): _reg.register_strategy("invert_permutation", strategy.invert_permutation_strategy) _reg.register_shape_func("invert_permutation", False, elemwise_shape_func) + ##################### # Shape functions # ##################### @@ -899,9 +900,9 @@ def tile_shape_func(attrs, inputs, _): @script -def _split_shape_func(data_shape, index, indices_or_sections, axis): +def _split_shape_func(data_shape, index, indices_or_sections, param_is_indices, axis): out = output_tensor((data_shape.shape[0],), "int64") - if len(indices_or_sections) == 1: + if param_is_indices: for i in const_range(data_shape.shape[0]): if i == axis: assert ( @@ -949,10 +950,18 @@ def split_shape_func(attrs, inputs, _): if isinstance(indices_or_sections, int) else len(indices_or_sections) + 1 ) - if isinstance(indices_or_sections, int): + + param_is_indices = isinstance(indices_or_sections, int) + if param_is_indices: indices_or_sections = [indices_or_sections] return [ - _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis)) + _split_shape_func( + inputs[0], + convert(i), + convert(indices_or_sections), + convert(param_is_indices), + convert(axis), + ) for i in range(num_out) ] diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index daf1e098d7f1..f1153c6a8575 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -68,11 +68,15 @@ def softmax_pattern(): def check_quantized_softmax(extract): """Check if softmax is supported by CMSIS-NN.""" + dequantize_call = extract.args[0].args[0] + scale = extract.args[1].data.numpy().item(0) + zero_point = extract.args[2].data.numpy().item(0) # check for dtypes of quantize and dequantize return ( - extract.attrs.out_dtype == "int8" - and extract.args[0].args[0].args[0].checked_type.dtype == "int8" + (scale == 1.0 / 256 and zero_point == -128) + and extract.attrs.out_dtype == "int8" + and dequantize_call.args[0].checked_type.dtype == "int8" ) return [ diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py new file mode 100644 index 000000000000..0da81101c77b --- /dev/null +++ b/python/tvm/relay/op/contrib/ethosu.py @@ -0,0 +1,249 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Arm(R) Ethos(TM)-U NPU supported operators."""
+from typing import List, Tuple, Callable
+import numpy as np  # type: ignore
+
+import tvm  # type: ignore
+from tvm.relay.expr import Constant  # type: ignore
+from tvm.relay.op.contrib.register import register_pattern_table  # type: ignore
+from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant  # type: ignore
+from tvm.relay.backend.contrib.ethosu.util import QConv2DArgs  # type: ignore
+from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs
+from tvm.relay.backend.contrib.ethosu.util import RequantArgs
+from tvm.relay.backend.contrib.ethosu.util import get_dim_value
+from ethosu.vela import api as vapi  # type: ignore
+
+
+class TensorParams:
+    """
+    This class will parse a tvm Expr along with quantization scale
+    and zero point to populate parameters that are required
+    for the creation of tensors in Vela.
+    """
+
+    def __init__(self, tensor, layout=None, scale=None, zero_point=None):
+        self.tensor = tensor
+        if isinstance(tensor, Constant):
+            self.values = tensor.data.asnumpy()
+        else:
+            self.values = None
+        self.dtype = tensor.checked_type.dtype
+        self.shape = [int(i) for i in tensor.checked_type.shape]
+        self.layout = layout
+
+        if scale is not None and zero_point is not None:
+            self.q_params = vapi.NpuQuantization(
+                scale.data.asnumpy().astype("float32"), zero_point.data.asnumpy().astype(self.dtype)
+            )
+        else:
+            # put default values
+            self.q_params = vapi.NpuQuantization(1.0, 0)
+
+
+def check_strides(strides: List[int]) -> bool:
+    """This function checks whether strides are within the limits supported by the NPU"""
+    stride_range = (1, 3)
+    smin, smax = stride_range
+    if not smax >= strides[0] >= smin:
+        return False
+    if not smax >= strides[1] >= smin:
+        return False
+    return True
+
+
+def check_valid_dtypes(tensor_params: List[TensorParams]) -> bool:
+    """This function checks whether dtypes are supported by the NPU"""
+    supported_dtypes = (np.uint8, np.int8)
+    for tep in tensor_params:
+        # Check for dtypes
+        if np.dtype(tep.dtype) not in supported_dtypes:
+            return False
+        # Check for shape sizes
+        if any(dimlen > 65536 for dimlen in tep.shape):
+            return False
+    return True
+
+
+def check_weights(weights: TensorParams, dilation: List[int]):
+    """This function checks whether the weight tensor is compatible with the NPU"""
+    dilated_height_range = (1, 64)
+    dilated_hxw_range = (1, 64 * 64)
+    weights_limit = 127 * 65536
+    dilated_width = (weights.shape[get_dim_value(weights.layout, "W")] - 1) * dilation[0] + 1
+    dilated_height = (weights.shape[get_dim_value(weights.layout, "H")] - 1) * dilation[1] + 1
+    dh_min, dh_max = dilated_height_range
+    if not dh_min <= dilated_height <= dh_max:
+        return False
+    dilated_hxw = dilated_height * dilated_width
+    dhxw_min, dhxw_max = dilated_hxw_range
+    if not dhxw_min <= dilated_hxw <= dhxw_max:
+        return False
+    # A saturation upper bound check for accumulators
+    weights.values = weights.values - weights.q_params.zero_point
+    axis = (
+        get_dim_value(weights.layout, "H"),
+        get_dim_value(weights.layout, "W"),
+        get_dim_value(weights.layout, "I"),
+    )
+    sum_weights = np.amax(np.sum(np.absolute(weights.values), axis=axis))
+    return sum_weights <= weights_limit
+
+
+def check_bias(bias: TensorParams):
+    """This function checks whether the bias values fit in 40 bits"""
+    if bias and bias.dtype == np.dtype("int64"):
+        valid = all(len(bin(bias_value)[2:]) <= 40 for bias_value in bias.values)
+        return valid
+    return True
+
+
+def check_batch_size(ifm: TensorParams):
+    """This function checks for the number of batches Vela currently supports"""
+    return ifm.shape[0] == 1
+
+
+def check_dilation(dilation: List[int]):
+    """This function checks whether dilation is within the limits supported by the NPU"""
+    dilation_range = (1, 2)
+    dmin, dmax = dilation_range
+    if not dmin <= dilation[0] <= dmax:
+        return False
+    if not dmin <= dilation[1] <= dmax:
+        return False
+    return True
+
+
+def check_padding(padding: List[int], bounds: List[int]):
+    """This function checks whether padding is within the limits supported by the NPU"""
+    if len(padding) != 4 or len(bounds) != 4:
+        return False
+    top, left, bottom, right = padding
+    topb, leftb, bottomb, rightb = bounds
+    return not (top > topb or left > leftb or bottom > bottomb or right > rightb)
+
+
+class QnnConv2DParams:
+    """
+    This class will parse a Call to an ethosu.qnn_conv2d composite function
+    and extract quantization information of all the associated tensors.
+    """
+
+    composite_name = "ethosu.qnn_conv2d"
+    # The NPU only supports padding up to the bounds below
+    padding_bounds = [31, 31, 32, 32]
+    activation_map = {"clip": "CLIP"}
+
+    def __init__(self, func_body: tvm.relay.Function):
+        activation = None
+        if str(func_body.op) in self.activation_map.keys():
+            activation = func_body
+            requantize_op = activation.args[0]
+        else:
+            requantize_op = func_body
+        bias_add = requantize_op.args[0]
+        qnn_conv2d = bias_add.args[0]
+        data_layout = qnn_conv2d.attrs.data_layout
+        kernel_layout = qnn_conv2d.attrs.kernel_layout
+        # We consider the weights & biases as params, as they should be Constants
+        self.weights = TensorParams(
+            qnn_conv2d.args[QConv2DArgs.WEIGHTS.value],
+            kernel_layout,
+            qnn_conv2d.args[QConv2DArgs.WEIGHTS_SCALE.value],
+            qnn_conv2d.args[QConv2DArgs.WEIGHTS_ZERO_POINT.value],
+        )
+
+        self.biases = TensorParams(
+            bias_add.args[BiasAddArgs.BIASES.value],
+            data_layout,
+            requantize_op.args[RequantArgs.IFM_SCALE.value],
+            requantize_op.args[RequantArgs.IFM_ZERO_POINT.value],
+        )
+        self.ifm = TensorParams(
+            qnn_conv2d.args[QConv2DArgs.IFM.value],
+            data_layout,
+            qnn_conv2d.args[QConv2DArgs.IFM_SCALE.value],
+            qnn_conv2d.args[QConv2DArgs.IFM_ZERO_POINT.value],
+        )
+        self.ofm = TensorParams(
+            func_body,
+            data_layout,
+            requantize_op.args[RequantArgs.OFM_SCALE.value],
+            requantize_op.args[RequantArgs.OFM_ZERO_POINT.value],
+        )
+        self.padding = qnn_conv2d.attrs.padding
+        self.strides = qnn_conv2d.attrs.strides
+        self.dilation = qnn_conv2d.attrs.dilation
+        self.activation = activation
+
+        # If groups are equal to the channel count, it's a depthwise_conv2d
+        self.groups = qnn_conv2d.attrs.groups
+        self.is_depthwise = False
+        channels_axis = {"HWIO": 3, "HWOI": 2}
+        if qnn_conv2d.attrs.groups == self.weights.shape[channels_axis[kernel_layout]]:
+            self.is_depthwise = True
+
+    def is_valid(self) -> bool:
+        """
+        This function checks whether QnnConv2D has compatible attributes with the NPU
+        """
+        tensor_params = [self.weights, self.ifm, self.ofm]
+        if not check_valid_dtypes(tensor_params):
+            return False
+        if not check_weights(self.weights, self.dilation):
+            return False
+        if not check_bias(self.biases):
+            return False
+        if not check_strides(self.strides):
+            return False
+        if not check_batch_size(self.ifm):
+            return False
+        if not check_dilation(self.dilation):
+            return False
+        if not check_padding(self.padding, self.padding_bounds):
+            return False
+        legal_groups = [1, self.ofm.shape[3]]
+        if self.groups not in legal_groups:
+            return False
+        # This should be a valid QnnDepthwise2DParams, not QnnConv2DParams
+        return not self.is_depthwise
+
+
+def qnn_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern:
+    """
+    This function creates the pattern for qnn.conv2d with an optional fused ReLU activation.
+    """
+    qnn_conv2d = is_op("qnn.conv2d")(
+        wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
+    ).has_attr({"kernel_layout": "HWIO"})
+    bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant())
+    req = is_op("qnn.requantize")(
+        qnn_conv2d | bias_add, is_constant(), is_constant(), is_constant(), is_constant()
+    )
+    clip_or_req = req.optional(is_op("clip"))
+    return clip_or_req
+
+
+@register_pattern_table("ethosu")
+def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]:
+    return [
+        (
+            QnnConv2DParams.composite_name,
+            qnn_conv2d_pattern(),
+            lambda pat: QnnConv2DParams(pat).is_valid(),
+        )
+    ]
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index a9e485866381..f06ee09fc7f4 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -39,17 +39,17 @@

 # softmax
 reg.register_strategy("nn.softmax", strategy.softmax_strategy)
-reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
+reg.register_pattern("nn.softmax", OpPattern.OUT_ELEMWISE_FUSABLE)


 # fast softmax
 reg.register_strategy("nn.fast_softmax", strategy.fast_softmax_strategy)
-reg.register_pattern("nn.fast_softmax", OpPattern.OPAQUE)
+reg.register_pattern("nn.fast_softmax", OpPattern.OUT_ELEMWISE_FUSABLE)


 # log_softmax
 reg.register_strategy("nn.log_softmax", strategy.log_softmax_strategy)
-reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
+reg.register_pattern("nn.log_softmax", OpPattern.OUT_ELEMWISE_FUSABLE)


 @reg.register_legalize("nn.matmul")
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index e882bcf7e271..5a17db745b3e 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -1548,7 +1548,7 @@ def dense(data, weight, units=None, out_dtype=""):
     return _make.dense(data, weight, units, out_dtype)


-def contrib_dense_pack(data, weight, weight_layout="NK", units=None, out_dtype=""):
+def contrib_dense_pack(data, weight, weight_layout="NC", units=None, out_dtype=""):
     """Dense operator.
     Applies a linear transformation with packed weight

@@ -1567,7 +1567,7 @@ def contrib_dense_pack(data, weight, weight_layout="NK", units=None, out_dtype="
         of shape `(units // pack_weight_tile, units_in, pack_weight_tile)`.

     weight_layout: str
-        The layout of weight, such as "NK" or "NK8n".
+        The layout of weight, such as "NC" or "NC8n".

     units : int, optional
         Number of hidden units of the dense transformation.
diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py
index 8d0543ba30af..cf915777ed0b 100644
--- a/python/tvm/relay/op/strategy/__init__.py
+++ b/python/tvm/relay/op/strategy/__init__.py
@@ -28,3 +28,4 @@
 from . import bifrost
 from . import rocm
 from . import intel_graphics
+from . 
import hexagon diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ba47ae7bc4f1..918c36c20079 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -144,7 +144,11 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"): + if ( + target.kind.name == "cuda" + and data.dtype in ("int8", "uint8") + and kernel.dtype in ("int8", "uint8") + ): assert data.dtype == kernel.dtype strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8), @@ -293,7 +297,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): "Unsupported shape for conv2d HWNC.\ Need to satisfy tensor core schedule." ) - elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: + elif target.kind.name == "cuda" and layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), @@ -353,7 +357,8 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ic_chunk = in_channels // 4 if ( - data.dtype in ["int8", "uint8"] + target.kind.name == "cuda" + and data.dtype in ["int8", "uint8"] and kernel.dtype in ["int8", "uint8"] and channels % groups == 0 and out_channels % groups == 0 @@ -1210,3 +1215,16 @@ def invert_permutation_strategy_cuda(attrs, inputs, out_type, target): name="invert_permutation.cuda", ) return strategy + + +@einsum_strategy.register(["cuda", "gpu"]) +def einsum_strategy_cuda(attrs, inputs, out_type, target): + """einsum cuda strategy""" + strategy = _op.OpStrategy() + # TODO: Add cuda-specific op implementation for einsum + strategy.add_implementation( + wrap_compute_einsum(topi.einsum), + wrap_topi_schedule(topi.generic.schedule_extern), + name="einsum.cuda", + ) + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 9c756f201721..2822585caeaf 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1669,3 +1669,24 @@ def invert_permutation_strategy(attrs, inputs, out_type, target): name="invert_permutation.generic", ) return strategy + + +def wrap_compute_einsum(topi_compute): + """Wrap einsum topi compute""" + + def _compute_einsum(attrs, inputs, _): + return [topi_compute(attrs.equation, *inputs)] + + return _compute_einsum + + +@override_native_generic_func("einsum_strategy") +def einsum_strategy(attrs, inputs, out_type, target): + """einsum generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_einsum(topi.einsum), + wrap_topi_schedule(topi.generic.schedule_einsum), + name="einsum.generic", + ) + return strategy diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py new file mode 100644 index 000000000000..cb1fec355917 --- /dev/null +++ b/python/tvm/relay/op/strategy/hexagon.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Definition of Hexagon operator strategy.""" + +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import + +from tvm import topi +from .generic import * +from .. import op as _op + + +@conv2d_strategy.register("hexagon") +def conv2d_strategy_hexagon(attrs, inputs, out_type, target): + """Conv2d strategy for Hexagon""" + strategy = _op.OpStrategy() + data_layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + + if data_layout == "NHWC" and kernel_layout == "HWIO": + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.hexagon.schedule_conv2d_nhwc), + name="conv2d.hexagon", + ) + return strategy + + raise RuntimeError( + "Unsupported layouts: data_layout:{}, kernel_layout:{}".format(data_layout, kernel_layout) + ) diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index 64373dcdd7bf..8d9c28ba714b 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -27,13 +27,6 @@ from .cuda import judge_winograd, naive_schedule -@schedule_lrn.register("rocm") -def schedule_lrn_rocm(attrs, outs, target): - """schedule LRN for rocm""" - with target: - return topi.rocm.schedule_lrn(outs) - - @conv2d_strategy.register("rocm") def conv2d_strategy_rocm(attrs, inputs, out_type, target): """conv2d rocm strategy""" diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index a38a23064d6f..e47928919ce1 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -1104,6 +1104,29 @@ def concatenate(data, axis): return _make.concatenate(Tuple(data), axis) +def einsum(data, equation): + """Evaluates the Einstein summation convention on data + + Parameters + ---------- + data : Union(List[relay.Expr], Tuple[relay.Expr]) + A list of tensors. + equation : str + The einsum expression string. + + Returns + ------- + result : relay.Expr + The output tensor from the einsum op. + """ + data = list(data) + if not data: + raise ValueError("relay.einsum requires data to be non-empty.") + if not isinstance(equation, str): + raise ValueError("einsum `equation` must be a str") + return _make.einsum(Tuple(data), equation) + + def stack(data, axis): """Join a sequence of arrays along a new axis. diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 3226240fbe39..52fe6c8ebe2f 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -20,7 +20,9 @@ import tvm from tvm import relay +from tvm._ffi.base import TVMError from .. import op as reg +from ....topi.x86.utils import target_has_sse42 ################################################# # Register the functions for different operators. 
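For context (not part of the patch), a minimal usage sketch of the relay.einsum op wired up above; the shapes and the equation are arbitrary illustrative choices:

    import tvm
    from tvm import relay

    a = relay.var("a", shape=(2, 3), dtype="float32")
    b = relay.var("b", shape=(3, 4), dtype="float32")
    # "ij,jk->ik" expresses an ordinary matrix multiply as an einsum equation
    out = relay.einsum([a, b], "ij,jk->ik")
    mod = tvm.IRModule.from_expr(relay.Function([a, b], out))
    print(mod)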
@@ -139,11 +141,35 @@ def helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay_op):
     data, kernel, input_zero_point, kernel_zero_point, _, _ = inputs

     shift_data = relay.subtract(
-        relay.cast(data, dtype="int16"), relay.cast(input_zero_point, "int16")
-    )
-    shift_kernel = relay.subtract(
-        relay.cast(kernel, dtype="int16"), relay.cast(kernel_zero_point, "int16")
+        relay.cast(data, dtype="int16"), relay.cast(input_zero_point, dtype="int16")
     )
+    # If kernel zero point is a scalar we can directly subtract it.
+    if len(types[3].shape) == 0:
+        shift_kernel = relay.subtract(
+            relay.cast(kernel, dtype="int16"), relay.cast(kernel_zero_point, dtype="int16")
+        )
+    # Otherwise it needs to be broadcast.
+    else:
+        # Determine output axis of kernel for spatial operations.
+        if hasattr(attrs, "kernel_layout"):
+            output_axis = tvm.tir.layout(attrs["kernel_layout"]).index_of("O")
+        # For dense operations, broadcast to [N, K] layout.
+        elif isinstance(attrs, relay.op.op_attrs.DenseAttrs):
+            output_axis = 0
+        # For matrix multiplication instead expand to [K, N] layout.
+        elif isinstance(attrs, relay.op.op_attrs.MatmulAttrs):
+            output_axis = 1
+        else:
+            raise TVMError(
+                "Legalization of %s is not yet supported with per channel parameters"
+                % str(type(attrs))
+            )
+
+        shift_kernel = relay.nn.bias_add(
+            relay.cast(kernel, dtype="int16"),
+            relay.cast(kernel_zero_point, dtype="int16"),
+            output_axis,
+        )
     new_attrs = {k: attrs[k] for k in attrs.keys()}
     return relay_op(shift_data, shift_kernel, **new_attrs)

@@ -318,7 +344,7 @@ def _shift(data, zero_point, out_dtype):
 def is_fast_int8_on_intel():
     """Checks whether the hardware has support for fast Int8 arithmetic operations."""
     target = tvm.target.Target.current(allow_none=False)
-    return target.mcpu in {"skylake-avx512", "cascadelake"}
+    return target_has_sse42(target.mcpu)


 def is_fast_int8_on_arm():
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index e74256ec74c3..83b5cf0a831c 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -276,8 +276,10 @@ def conv2d(
 ):
     r"""Quantized 2D convolution.

-    This operator convolves quantized data with quantized kernel. The scale of
-    the output quantized tensor is the product of the kernel_scale and
+    This operator convolves quantized data with quantized kernel.
+    If doing per-channel quantization, qnn expects the kernel_scale
+    and optionally the kernel_zero_point to be 1-D vectors instead of scalars.
+    The scale of the output quantized tensor is the product of the kernel_scale and
     input_scale of the input quantized tensors. The zero point of the output
     quantized tensor is 0. By default, the dtype of output is int32. Please also
     refer to Requantize operator to understand how to scale back the int32
@@ -544,6 +546,9 @@ def dense(

     `Y = X * W`

+    If doing per-channel quantization, qnn expects the kernel_scale
+    and optionally the kernel_zero_point to be 1-D vectors instead of scalars.
+
     Parameters
     ----------
     data : tvm.relay.Expr
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 8eb07d7b583b..9fc75199bdf5 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -81,6 +81,7 @@ def check_grad(
     scale=None,
     mean=0,
     mode="higher_order",
+    target_devices=None,
 ):
     """Perform numerical gradient checking given a relay function.

@@ -117,6 +118,11 @@
     mean: float
         The mean of the inputs.
+ + target_devices: Optional[List[Tuple[tvm.target.Target, tvm.runtime.Device]]] + A list of targets/devices on which the gradient should be + tested. If not specified, will default to `tvm.testing.enabled_targets()`. + """ fwd_func = run_infer_type(func) @@ -133,7 +139,10 @@ def check_grad( if test_inputs is None: test_inputs = inputs - for target, dev in enabled_targets(): + if target_devices is None: + target_devices = enabled_targets() + + for target, dev in target_devices: # Eval the backward and forward functions # TODO(mbs): Evaluate a pair of functions so can share preparation between them. bwd_func_compiled = relay.create_executor(device=dev, target=target).evaluate(bwd_func) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index cf55c67c8083..0ed75191c40d 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -18,6 +18,7 @@ import tvm from tvm import relay from tvm.ir import TensorAffineType, TupleAffineType +from tvm.tir import bijective_layout from ..op import register_fake_quantization_to_integer @@ -25,6 +26,14 @@ def fold_constant(expr): return relay.transform.FoldConstantExpr(expr, tvm.IRModule()) +def get_zeros(scale): + return fold_constant(relay.op.cast(relay.op.zeros_like(scale), "int32")) + + +def infer_shape(expr): + return relay.transform.InferType()(tvm.IRModule.from_expr(expr))["main"].body.checked_type.shape + + @register_fake_quantization_to_integer("qnn.dequantize") def dequantize(expr, type_map): """Remove dequantize op""" @@ -52,8 +61,13 @@ def quantize(expr, type_map): expr.args[1], expr.args[2], out_dtype=expr.attrs.out_dtype, + axis=t.axis, ) - return [out, TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype)] + + return [ + out, + TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype, expr.attrs.axis), + ] def register_unary_identity(op_name): @@ -73,6 +87,7 @@ def identity(expr, type_map): register_unary_identity("expand_dims") register_unary_identity("nn.max_pool2d") register_unary_identity("nn.batch_flatten") +register_unary_identity("nn.depth_to_space") @register_fake_quantization_to_integer("nn.avg_pool2d") @@ -94,7 +109,11 @@ def bias_add(expr, type_map): b_t = type_map[b] in_scale = fold_constant(x_t.scale) in_zero_point = fold_constant(x_t.zero_point) - if not tvm.ir.structural_equal(x_t, b_t): + if not ( + tvm.ir.structural_equal(x_t.scale, b_t.scale) + and tvm.ir.structural_equal(x_t.zero_point, b_t.zero_point) + and tvm.ir.structural_equal(x_t.dtype, b_t.dtype) + ): b = relay.qnn.op.requantize( b, b_t.scale, @@ -102,6 +121,7 @@ def bias_add(expr, type_map): in_scale, in_zero_point, out_dtype=x_t.dtype, + axis=0, ) out = relay.op.nn.bias_add(x, b, **expr.attrs) return [out, x_t] @@ -116,11 +136,13 @@ def conv2d(expr, type_map): x_t = type_map[x] w_t = type_map[weight] conv_scale = fold_constant(x_t.scale * w_t.scale) - conv_zp = relay.const(0) + conv_zp = get_zeros(conv_scale) out = relay.qnn.op.conv2d( x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs ) - return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype)] + out_layout = attrs["out_layout"] if attrs["out_layout"] != "" else attrs["data_layout"] + out_axis = bijective_layout(out_layout, "NCHW").backward_index(list(range(4)))[1] + return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype, out_axis.value)] 
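For context (not part of the patch), a small illustrative check of the bijective_layout API that the conv2d rewrite above relies on; the coordinate values below are arbitrary:

    from tvm.tir import bijective_layout

    # BijectiveLayout maps element coordinates between two layouts.
    bl = bijective_layout("NHWC", "NCHW")
    # NHWC coordinates (n=0, h=5, w=6, c=7) land at NCHW coordinates (0, 7, 5, 6)
    print(bl.forward_index([0, 5, 6, 7]))
    # backward_index is the inverse mapping, as used to derive out_axis above
    print(bl.backward_index([0, 7, 5, 6]))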
 @register_fake_quantization_to_integer("nn.dense")
@@ -132,11 +154,11 @@ def dense(expr, type_map):
     x_t = type_map[x]
     w_t = type_map[weight]
     dense_scale = fold_constant(x_t.scale * w_t.scale)
-    dense_zp = relay.const(0)
+    dense_zp = get_zeros(dense_scale)
     out = relay.qnn.op.dense(
         x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
     )
-    return [out, TensorAffineType(dense_scale, dense_zp, out.attrs.out_dtype)]
+    return [out, TensorAffineType(dense_scale, dense_zp, out.attrs.out_dtype, 1)]


 @register_fake_quantization_to_integer("nn.batch_matmul")
@@ -148,7 +170,7 @@ def batch_matmul(expr, type_map):
     matmul_scale = fold_constant(x_t.scale * y_t.scale)
     matmul_zp = relay.const(0)
     out = relay.qnn.op.batch_matmul(x, y, x_t.zero_point, y_t.zero_point, x_t.scale, y_t.scale)
-    return [out, TensorAffineType(matmul_scale, matmul_zp, out.attrs.out_dtype)]
+    return [out, TensorAffineType(matmul_scale, matmul_zp, out.attrs.out_dtype, x_t.axis)]


 @register_fake_quantization_to_integer("concatenate")
@@ -198,19 +220,52 @@ def clip(expr, type_map):
     amax = expr.attrs.a_max
     scale = fold_constant(t.scale)
     z_p = fold_constant(t.zero_point)
-    if isinstance(scale, relay.expr.Constant) and isinstance(z_p, relay.expr.Constant):
+    if (
+        isinstance(scale, relay.expr.Constant)
+        and scale.data.numpy().size == 1
+        and isinstance(z_p, relay.expr.Constant)
+        and z_p.data.numpy().size == 1
+    ):
         scale = scale.data.numpy().item()
         z_p = z_p.data.numpy().item()
         new_min = int(amin / scale + z_p)
         new_max = int(amax / scale + z_p)
         out = relay.op.clip(arg, new_min, new_max)
     else:
-        amin = relay.op.round(relay.op.const(amin) / scale + z_p)
-        amax = relay.op.round(relay.op.const(amax) / scale + z_p)
-        out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+        if not isinstance(amin, relay.expr.Constant):
+            amin = relay.op.const(amin)
+        if not isinstance(amax, relay.expr.Constant):
+            amax = relay.op.const(amax)
+
+        scale_shape = infer_shape(scale)
+        if len(scale_shape) > 0 and scale_shape[0] > 1:
+            b_shape = [1] * len(infer_shape(arg))
+            b_shape[t.axis] = -1
+            amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape), b_shape)
+            amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape), b_shape)
+        amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+        amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+        out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)), fold_constant(amax))
+
     return [out, t]


+@register_fake_quantization_to_integer("nn.relu")
+def relu(expr, type_map):
+    """Rewrite a relu op"""
+    arg = expr.args[0]
+    t = type_map[arg]
+    scale_shape = infer_shape(t.scale)
+    z_p = t.zero_point
+    assert len(scale_shape) <= 1
+    if len(scale_shape) == 1 and scale_shape[0] > 1:
+        b_shape = [1] * len(infer_shape(arg))
+        b_shape[t.axis] = -1
+        z_p = relay.op.reshape(relay.op.broadcast_to(z_p, scale_shape), b_shape)
+    zero = relay.op.cast(z_p, t.dtype)
+    return [relay.op.maximum(arg, fold_constant(zero)), t]
+
+
 @register_fake_quantization_to_integer("nn.pad")
 def pad(expr, type_map):
     """Rewrite an nn.pad op"""
@@ -231,6 +286,7 @@ def pad(expr, type_map):
             t.scale,
             t.zero_point,
             out_dtype=t.dtype,
+            axis=pad_t.axis,
         )
     else:
         ## If the pad-value is a constant, we need to quantize it
@@ -319,6 +375,7 @@ def binary(expr, type_map):
             out_t.scale,
             out_t.zero_point,
             out_dtype=out_t.dtype,
+            axis=left_t.axis,
         )

     if right_t != out_t:
@@ -329,6 +386,7 @@ def binary(expr, type_map):
             out_t.scale,
             out_t.zero_point,
             out_dtype=out_t.dtype,
+            axis=right_t.axis,
         )
     out = op(left, right)
     return [out, out_t]
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index a9834391ed88..045bf7904885 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -366,7 +366,9 @@ def text_summary(self):
             res += separate_line
         return res

-    def request(self, key, priority=1, session_timeout=0, max_retry=5):
+    def request(
+        self, key, priority=1, session_timeout=0, max_retry=5, session_constructor_args=None
+    ):
         """Request a new connection from the tracker.

         Parameters
@@ -384,6 +386,11 @@
         max_retry : int, optional
             Maximum number of times to retry before giving up.
+
+        session_constructor_args : list, optional
+            List of additional arguments to be passed to the remote session constructor.
+            The first element of the list is always a string specifying the name of
+            the session constructor; the following args are the positional args to that function.
         """
         last_err = None
         for _ in range(max_retry):
@@ -395,7 +402,13 @@
                 if value[0] != base.TrackerCode.SUCCESS:
                     raise RuntimeError("Invalid return value %s" % str(value))
                 url, port, matchkey = value[1]
-                return connect(url, port, matchkey, session_timeout)
+                return connect(
+                    url,
+                    port,
+                    matchkey,
+                    session_timeout,
+                    session_constructor_args=session_constructor_args,
+                )
             except socket.error as err:
                 self.close()
                 last_err = err
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index b3043d8b8760..4f625c0c67f1 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -22,7 +22,9 @@
 """
 import warnings
 from tvm.runtime import _ffi_api
+from tvm.rpc import base as rpc_base
 from . import vm
+from .profiling import Report


 def enabled():
@@ -35,10 +37,18 @@ class VirtualMachineProfiler(vm.VirtualMachine):

     def __init__(self, exe, device, memory_cfg=None):
         super(VirtualMachineProfiler, self).__init__(exe, device, memory_cfg)
-        self.module = _ffi_api._VirtualMachineDebug(exe.module)
+
+        # Make sure the constructor of the VM module is on the proper device
+        # Remote devices have device_type of their actual device_type + RPC_SESS_MASK
+        if device.device_type >= rpc_base.RPC_SESS_MASK:
+            self.module = device._rpc_sess.get_function("runtime._VirtualMachineDebug")(exe)
+        else:
+            self.module = _ffi_api._VirtualMachineDebug(exe.module)
+
         self._init = self.module["init"]
         self._invoke = self.module["invoke"]
         self._profile = self.module["profile"]
+        self._profile_rpc = self.module["profile_rpc"]
         self._set_input = self.module["set_input"]
         self._setup_device(device, memory_cfg)

@@ -59,7 +69,7 @@ def profile(self, *args, func_name="main", collectors=None, **kwargs):
             The name of the function.

         collectors : Optional[Sequence[MetricCollector]]
-            Extra metrics to collect.
+            Extra metrics to collect. If profiling over RPC, collectors must be `None`.

         args : list[tvm.runtime.NDArray] or list[np.ndarray]
             The arguments to the function.
@@ -72,7 +82,10 @@
         timing_results : str
             Overall and per-op timing results formatted in a table.
         """
-        collectors = [] if collectors is None else collectors
         if args or kwargs:
             self.set_input(func_name, *args, **kwargs)
+        if self.module.type_key == "rpc":
+            # We cannot serialize MetricCollectors over RPC
+            assert collectors is None, "Profiling with collectors is not supported over RPC"
+            return Report.from_json(self._profile_rpc(func_name))
         return self._profile(func_name, collectors)
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 881691609398..b91fe727698b 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -104,6 +104,22 @@ def json(self):
         """
         return _ffi_api.AsJSON(self)

+    @classmethod
+    def from_json(cls, s):
+        """Deserialize a report from JSON.
+
+        Parameters
+        ----------
+        s : str
+            Report serialized via :py:meth:`json`.
+
+        Returns
+        -------
+        report : Report
+            The deserialized report.
+        """
+        return _ffi_api.FromJSON(s)
+

 @_ffi.register_object("runtime.profiling.MetricCollector")
 class MetricCollector(Object):
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 6416ad7814e1..2be3f3ec1a78 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -48,6 +48,8 @@ def _convert(arg, cargs):
             dtype = "int32" if isinstance(arg, (_base.integer_types, bool)) else "float32"
             value = tvm.nd.array(np.array(arg, dtype=dtype), device=tvm.cpu(0))
             cargs.append(value)
+        elif isinstance(arg, str):
+            cargs.append(arg)
         else:
             raise TypeError("Unsupported type: %s" % (type(arg)))
diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py
index 60fc49678866..51ee0aed982c 100644
--- a/python/tvm/script/parser.py
+++ b/python/tvm/script/parser.py
@@ -594,6 +594,19 @@ def transform_For(self, node):
             self.current_lineno, self.current_col_offset = old_lineno, old_col_offset
         return res

+    def transform_While(self, node):
+        """While visitor
+        AST abstract grammar:
+            While(expr condition, stmt* body)
+        """
+        condition = self.transform(node.condition)
+        # body
+        self.context.enter_scope(nodes=node.body.stmts)
+        body = self.parse_body(node)
+        self.context.exit_scope()
+
+        return tvm.tir.While(condition, body, span=tvm_span_from_synr(node.span))
+
     def transform_With(self, node):
         """With visitor
         AST abstract grammar:
diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py
index bb408f6cdc8f..cba067990bef 100644
--- a/python/tvm/script/scope_handler.py
+++ b/python/tvm/script/scope_handler.py
@@ -591,10 +591,13 @@ class RangeHandler(ForScopeHandler):
     def __init__(self):
         def for_range(
             begin: PrimExpr,
-            end: PrimExpr,
+            end: PrimExpr = None,
             annotations: Optional[Mapping[str, Object]] = None,
             span: Optional[Span] = None,
         ):
+            if end is None:
+                end = begin
+                begin = 0
             return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span)

         super().__init__(for_range)
diff --git a/python/tvm/support.py b/python/tvm/support.py
index 800bfe4e2546..1adbee09c52c 100644
--- a/python/tvm/support.py
+++ b/python/tvm/support.py
@@ -29,7 +29,14 @@ def libinfo():
     info: Dict[str, str]
         The dictionary of compile-time info.
""" - return {k: v for k, v in GetLibInfo().items()} # pylint: disable=unnecessary-comprehension + get_lib_info_func = get_global_func("support.GetLibInfo", allow_missing=True) + if get_lib_info_func is not None: + lib_info = get_lib_info_func() + if lib_info is None: + return {} + else: + return {} + return dict(lib_info.items()) class FrontendTestModule(Module): diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index d4b538a4bef0..af2f5d857293 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -483,6 +483,8 @@ def hexagon(cpu_ver="v66", **kwargs): error if invalid. Does not affect codegen. llvm_options : str or list of str (default: None) User defined compiler arguments. + link_params : bool (default: False) + Whether to link graph parameters into the LLVM module. """ # Some of the target parameters correspond to target kind attributes @@ -507,6 +509,7 @@ def hexagon(cpu_ver="v66", **kwargs): "hvx": 128, "sim_options": None, "llvm_options": None, + "link_params": False, } config.update(kwargs) @@ -615,12 +618,27 @@ def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument args = [s.replace("=", "@") for s in llvm_options.split()] return "--llvm-options=" + ",".join(args) + # TVM target attributes string + def create_tvm_options(cpu_ver, config): # pylint: disable=unused-argument + """ Create TVM target features string. """ + + features = { + "link_params": "link-params", + } + opts = "" + for k in config: + if k in features: + opts += " --" + features[k] + "=" + str(config[k]) + return opts + # Sim args os.environ["HEXAGON_SIM_ARGS"] = create_sim_options(cpu_ver, config) target_str = create_llvm_target(cpu_ver, config) llvm_str = create_llvm_options(cpu_ver, config) - args_list = target_str.split() + llvm_str.split() + tvm_str = create_tvm_options(cpu_ver, config) + + args_list = target_str.split() + llvm_str.split() + tvm_str.split() return Target(" ".join(["hexagon"] + args_list)) diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 6af3429b3eef..a0b9b4373535 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -22,13 +22,13 @@ import tvm._ffi import tvm.tir import tvm.tir._ffi_api - from tvm._ffi.base import string_types +from tvm.ir import Array from tvm.runtime import convert +from . import _ffi_api from . import tag as _tag from . import tensor as _tensor -from . import _ffi_api def placeholder(shape, dtype=None, name="placeholder"): @@ -431,6 +431,7 @@ def reduce_axis(dom, name="rv", thread_tag="", span=None): def create_prim_func(ops: List[_tensor.Tensor]) -> tvm.tir.PrimFunc: """Create a TensorIR PrimFunc from tensor expression + Parameters ---------- ops : List[Tensor] @@ -473,6 +474,6 @@ def tir_matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None: func : tir.PrimFunc The created function. 
""" - if not isinstance(ops, list): + if not isinstance(ops, (list, tuple, Array)): ops = [ops] return _ffi_api.CreatePrimFunc(ops) diff --git a/python/tvm/testing/__init__.py b/python/tvm/testing/__init__.py index 56a435ea3887..d84846725ec4 100644 --- a/python/tvm/testing/__init__.py +++ b/python/tvm/testing/__init__.py @@ -17,21 +17,16 @@ # pylint: disable=redefined-builtin, wildcard-import """Utility Python functions for TVM testing""" -from .utils import assert_allclose, assert_prim_expr_equal, check_bool_expr_is_true -from .utils import check_int_constraints_trans_consistency, check_numerical_grads -from .utils import device_enabled, enabled_targets, exclude_targets -from .utils import fixture, parameter, parameters, parametrize_targets, uses_gpu -from .utils import known_failing_targets, requires_cuda, requires_cudagraph -from .utils import requires_gpu, requires_llvm, requires_rocm, requires_rpc -from .utils import requires_tensorcore, requires_metal, requires_micro, requires_opencl -from .utils import requires_package -from .utils import identity_after, terminate_self + +from .utils import * from ._ffi_api import nop, echo, device_test, run_check_signal, object_use_count from ._ffi_api import test_wrap_callback, test_raise_error_callback, test_check_eq_callback from ._ffi_api import ErrorTest, FrontendTestModule, identity_cpp from .popen_pool import initializer, after_initializer, register_ffi, call_cpp_ffi -from .popen_pool import call_py_ffi, call_cpp_py_ffi +from .popen_pool import call_py_ffi, call_cpp_py_ffi, fast_summation, slow_summation +from .popen_pool import timeout_job from . import auto_scheduler +from . import autotvm diff --git a/tests/python/unittest/test_autotvm_common.py b/python/tvm/testing/autotvm.py similarity index 97% rename from tests/python/unittest/test_autotvm_common.py rename to python/tvm/testing/autotvm.py index 60f7d8bafb1b..6f7bb13fe6dc 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/python/tvm/testing/autotvm.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name, missing-function-docstring, missing-class-docstring """Common utilities for testing autotvm""" import time diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index 06b4fa4f65eb..95875acbd82c 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -31,8 +31,6 @@ """ -import collections - import pytest import _pytest @@ -49,6 +47,7 @@ "vulkan": "mark a test as requiring vulkan", "metal": "mark a test as requiring metal", "llvm": "mark a test as requiring llvm", + "ethosn": "mark a test as requiring ethosn", } @@ -66,6 +65,7 @@ def pytest_generate_tests(metafunc): """Called once per unit test, modifies/parametrizes it as needed.""" _parametrize_correlated_parameters(metafunc) _auto_parametrize_target(metafunc) + _add_target_specific_marks(metafunc) def pytest_collection_modifyitems(config, items): @@ -99,7 +99,39 @@ def _auto_parametrize_target(metafunc): """ + if "target" in metafunc.fixturenames: + # Check if any explicit parametrizations exist, and apply one + # if they do not. If the function is marked with either + # excluded or known failing targets, use these to determine + # the targets to be used. 
+ parametrized_args = [ + arg.strip() + for mark in metafunc.definition.iter_markers("parametrize") + for arg in mark.args[0].split(",") + ] + if "target" not in parametrized_args: + excluded_targets = getattr(metafunc.function, "tvm_excluded_targets", []) + + # Add a parametrize marker instead of calling + # metafunc.parametrize so that the parametrize rewriting + # can still occur. + mark = pytest.mark.parametrize( + "target", + [ + t["target"] + for t in utils._get_targets() + if t["target_kind"] not in excluded_targets + ], + scope="session", + ) + metafunc.definition.add_marker(mark) + + +def _add_target_specific_marks(metafunc): + """Add any target-specific marks to parametrizations over target""" + def update_parametrize_target_arg( + mark, argnames, argvalues, *args, @@ -130,6 +162,16 @@ def update_parametrize_target_arg( target = param_set[target_i] additional_marks = [] + if mark in metafunc.definition.own_markers: + xfail_targets = getattr(metafunc.function, "tvm_known_failing_targets", []) + target_kind = target.split()[0] if isinstance(target, str) else target.kind.name + if target_kind in xfail_targets: + additional_marks.append( + pytest.mark.xfail( + reason=f'Known failing test for target "{target_kind}"' + ) + ) + new_argvalues.append( pytest.param( *param_set, marks=_target_to_requirement(target) + additional_marks @@ -154,25 +196,7 @@ def update_parametrize_target_arg( # parametrize over targets. This adds the appropriate # @tvm.testing.requires_* markers for each target. for mark in metafunc.definition.iter_markers("parametrize"): - update_parametrize_target_arg(*mark.args, **mark.kwargs) - - # Check if any explicit parametrizations exist, and apply one - # if they do not. If the function is marked with either - # excluded or known failing targets, use these to determine - # the targets to be used. - parametrized_args = [ - arg.strip() - for mark in metafunc.definition.iter_markers("parametrize") - for arg in mark.args[0].split(",") - ] - if "target" not in parametrized_args: - excluded_targets = getattr(metafunc.function, "tvm_excluded_targets", []) - xfail_targets = getattr(metafunc.function, "tvm_known_failing_targets", []) - metafunc.parametrize( - "target", - _pytest_target_params(None, excluded_targets, xfail_targets), - scope="session", - ) + update_parametrize_target_arg(mark, *mark.args, **mark.kwargs) def _count_num_fixture_uses(items): @@ -211,43 +235,6 @@ def _remove_global_fixture_definitions(items): delattr(module, name) -def _pytest_target_params(targets, excluded_targets=None, xfail_targets=None): - # Include unrunnable targets here. They get skipped by the - # pytest.mark.skipif in _target_to_requirement(), showing up as - # skipped tests instead of being hidden entirely. - if targets is None: - if excluded_targets is None: - excluded_targets = set() - - if xfail_targets is None: - xfail_targets = set() - - target_marks = [] - for t in utils._get_targets(): - # Excluded targets aren't included in the params at all. - if t["target_kind"] not in excluded_targets: - - # Known failing targets are included, but are marked - # as expected to fail. 
- extra_marks = [] - if t["target_kind"] in xfail_targets: - extra_marks.append( - pytest.mark.xfail( - reason='Known failing test for target "{}"'.format(t["target_kind"]) - ) - ) - - target_marks.append((t["target"], extra_marks)) - - else: - target_marks = [(target, []) for target in targets] - - return [ - pytest.param(target, marks=_target_to_requirement(target) + extra_marks) - for target, extra_marks in target_marks - ] - - def _target_to_requirement(target): if isinstance(target, str): target = tvm.target.Target(target) @@ -255,6 +242,8 @@ def _target_to_requirement(target): # mapping from target to decorator if target.kind.name == "cuda" and "cudnn" in target.attrs.get("libs", []): return utils.requires_cudnn() + if target.kind.name == "cuda" and "cublas" in target.attrs.get("libs", []): + return utils.requires_cublas() if target.kind.name == "cuda": return utils.requires_cuda() if target.kind.name == "rocm": @@ -273,7 +262,7 @@ def _target_to_requirement(target): def _parametrize_correlated_parameters(metafunc): - parametrize_needed = collections.defaultdict(list) + parametrize_needed = {} for name, fixturedefs in metafunc.definition._fixtureinfo.name2fixturedefs.items(): fixturedef = fixturedefs[-1] @@ -282,13 +271,20 @@ def _parametrize_correlated_parameters(metafunc): ): group = fixturedef.func.parametrize_group values = fixturedef.func.parametrize_values - parametrize_needed[group].append((name, values)) + ids = fixturedef.func.parametrize_ids + if group in parametrize_needed: + assert ids == parametrize_needed[group]["ids"] + else: + parametrize_needed[group] = {"ids": ids, "params": []} + parametrize_needed[group]["params"].append((name, values)) for parametrize_group in parametrize_needed.values(): - if len(parametrize_group) == 1: - name, values = parametrize_group[0] - metafunc.parametrize(name, values, indirect=True) + params = parametrize_group["params"] + ids = parametrize_group["ids"] + if len(params) == 1: + name, values = params[0] + metafunc.parametrize(name, values, indirect=True, ids=ids) else: - names = ",".join(name for name, values in parametrize_group) - value_sets = zip(*[values for name, values in parametrize_group]) - metafunc.parametrize(names, value_sets, indirect=True) + names = ",".join(name for name, values in params) + value_sets = zip(*[values for name, values in params]) + metafunc.parametrize(names, value_sets, indirect=True, ids=ids) diff --git a/python/tvm/testing/popen_pool.py b/python/tvm/testing/popen_pool.py index 20345a2218fe..b646d7a89e94 100644 --- a/python/tvm/testing/popen_pool.py +++ b/python/tvm/testing/popen_pool.py @@ -16,6 +16,7 @@ # under the License. 
# pylint: disable=invalid-name, missing-function-docstring
 """Common functions for popen_pool test cases"""
+import time
 import tvm

 TEST_GLOBAL_STATE_1 = 0
@@ -57,3 +58,18 @@ def call_cpp_ffi(arg):

 def call_cpp_py_ffi(arg):
     return tvm.testing.identity_cpp(arg)
+
+
+def fast_summation(n):
+    return n * (n + 1) // 2
+
+
+def slow_summation(n):
+    r = 0
+    for i in range(0, n + 1):
+        r += i
+    return r
+
+
+def timeout_job(n):
+    time.sleep(n * 1.5)
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 85a8b7738184..62531ff7c194 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -81,6 +81,7 @@ def test_something():
 from tvm.contrib import nvcc, cudnn
 from tvm.error import TVMError
+from tvm.relay.op.contrib.ethosn import ethosn_available


 def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
@@ -593,6 +594,27 @@ def requires_cudnn(*args):
     return _compose(args, requirements)


+def requires_cublas(*args):
+    """Mark a test as requiring the cuBLAS library.
+
+    This also marks the test as requiring a cuda gpu.
+
+    Parameters
+    ----------
+    f : function
+        Function to mark
+    """
+
+    requirements = [
+        pytest.mark.skipif(
+            not tvm.get_global_func("tvm.contrib.cublas.matmul", True),
+            reason="cuBLAS library not enabled",
+        ),
+        *requires_cuda(),
+    ]
+    return _compose(args, requirements)
+
+
 def requires_nvptx(*args):
     """Mark a test as requiring the NVPTX compilation on the CUDA runtime
@@ -774,6 +796,28 @@ def requires_rpc(*args):
     return _compose(args, _requires_rpc)


+def requires_ethosn(*args):
+    """Mark a test as requiring ethosn to run.
+
+    Parameters
+    ----------
+    f : function
+        Function to mark
+    """
+    marks = [
+        pytest.mark.ethosn,
+        pytest.mark.skipif(
+            not ethosn_available(),
+            reason=(
+                "Ethos-N support not enabled. "
+                "Set USE_ETHOSN=ON in config.cmake to enable, "
+                "and ensure that hardware support is present."
+            ),
+        ),
+    ]
+    return _compose(args, marks)
+
+
 def requires_package(*packages):
     """Mark a test as requiring python packages to run.
@@ -945,7 +989,7 @@ def wraps(func):
     return wraps


-def parameter(*values, ids=None):
+def parameter(*values, ids=None, by_dict=None):
     """Convenience function to define pytest parametrized fixtures.

     Declaring a variable using ``tvm.testing.parameter`` will define a
@@ -965,16 +1009,23 @@ def parameter(*values, ids=None):

     Parameters
     ----------
-    values
+    values : Any
+
        A list of parameter values.  A unit test that accepts this
        parameter as an argument will be run once for each parameter
        given.

     ids : List[str], optional
+
        A list of names for the parameters.  If None, pytest will
        generate a name from the value.  These generated names may
        not be readable/useful for composite types such as tuples.

+    by_dict : Dict[str, Any]
+
+       A mapping from parameter name to parameter value, to set both the
+       values and ids.
+
     Returns
     -------
     function
@@ -992,8 +1043,22 @@ def parameter(*values, ids=None):
     >>> def test_using_size(shape):
     >>>     ... # Test code here

+    Or
+
+    >>> shape = tvm.testing.parameter(by_dict={'small': (5,10), 'large': (512,1024)})
+    >>> def test_using_size(shape):
+    >>>     ... # Test code here
+
     """
+    if by_dict is not None:
+        if values or ids:
+            raise RuntimeError(
+                "The by_dict parameter cannot be used alongside positional arguments"
+            )
+
+        ids, values = zip(*by_dict.items())
+
     # Optional cls parameter in case a parameter is defined inside a
     # class scope.
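    # The fixture below simply forwards request.param, so each entry of
    # `values` (with its id from `ids`) becomes one parametrized test case.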
@pytest.fixture(params=values, ids=ids) @@ -1006,7 +1071,7 @@ def as_fixture(*_cls, request): _parametrize_group = 0 -def parameters(*value_sets): +def parameters(*value_sets, ids=None): """Convenience function to define pytest parametrized fixtures. Declaring a variable using tvm.testing.parameters will define a @@ -1029,11 +1094,18 @@ def parameters(*value_sets): Parameters ---------- values : List[tuple] + A list of parameter value sets. Each set of values represents a single combination of values to be tested. A unit test that accepts parameters defined will be run once for every set of parameters in the list. + ids : List[str], optional + + A list of names for the parameter sets. If None, pytest will + generate a name from each parameter set. These generated names may + not be readable/useful for composite types such as tuples. + Returns ------- List[function] @@ -1062,6 +1134,7 @@ def fixture_func(*_cls, request): fixture_func.parametrize_group = parametrize_group fixture_func.parametrize_values = param_values + fixture_func.parametrize_ids = ids outputs.append(pytest.fixture(fixture_func)) return outputs diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index eb200df0c599..44006239acfd 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -27,7 +27,7 @@ from .expr import Select, BufferLoad, ProducerLoad, Load, Ramp, Broadcast, Shuffle from .expr import Call, CallEffectKind, Let, IterVar, Any -from .stmt import Stmt, LetStmt, AssertStmt, ForKind, For +from .stmt import Stmt, LetStmt, AssertStmt, ForKind, For, While from .stmt import BufferStore, BufferRealize, Store, ProducerStore, Allocate, AttrStmt from .stmt import ProducerRealize, SeqStmt from .stmt import IfThenElse, Evaluate, Prefetch, stmt_seq, stmt_list diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index 4ba8c5471b5d..2bfa0aacb184 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -523,6 +523,9 @@ def __init__(self, dtype, value, span=None): tvm.ir._ffi_api.FloatImm, dtype, value, span # type: ignore ) + def __float__(self): + return self.value + @tvm._ffi.register_object class IntImm(ConstExpr): diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index ac09bdbb264d..7545c09b020d 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -927,6 +927,183 @@ def after_cache_write(a: ty.handle, b: ty.handle) -> None: ########## Schedule: Compute location ########## + def compute_at( + self, + block: BlockRV, + loop: LoopRV, + preserve_unit_loops: bool = False, + ) -> None: + """Compute-At. Move a producer block under the specific loop, and regenerate the + loops induced by the block so that the buffer region produced by the producer block could + cover those regions consumed by its consumer blocks under the given loop. It requires: + + 1) `block` and `loop` are under the same scope, `loop` is not the ancestor of `block` + + 2) The scope block has stage-pipeline property + + 3) The subtree of the scope block, where the given block is in, satisfies the compact + dataflow condition. i.e. all the blocks in the scope block's subtree must be either + complete block or reduction block + + 4) The block is not an output block with regard to the scope block, i.e. 
the buffers written + by the block are allocated under the scope block + + 5) All the consumers of the block are under the given loop + + Parameters + ---------- + block : BlockRV + The block to be moved + + loop: LoopRV + The loop where the block to be moved under + + preserve_unit_loops: bool + Whether to keep the trivial loops whose extents are 1 + + Examples + -------- + + Before compute-at, in TensorIR, the IR is: + + .. code-block:: python + + @tvm.script.tir + def before_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + Create the schedule and do compute-at: + + .. code-block:: python + + sch = tir.Schedule(before_compute_at) + block = sch.get_block("B") + loop, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=False) + print(tvm.script.asscript(sch.mod["main"])) + + After applying compute-at, the IR becomes: + + .. code-block:: python + + @tvm.script.tir + def after_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + for i in tir.serial(0, 128): + for j in tir.serial(0, 128): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j) + B[vi, vj] = A[vi, vj] * 2.0 + for j in tir.serial(0, 128): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j) + C[vi, vj] = B[vi, vj] + 1.0 + + """ + _ffi_api.ScheduleComputeAt( # type: ignore # pylint: disable=no-member + self, + block, + loop, + preserve_unit_loops, + ) + + def reverse_compute_at( + self, + block: BlockRV, + loop: LoopRV, + preserve_unit_loops: bool = False, + ) -> None: + """Reverse-Compute-At. Move a consumer block under the specific loop, and regenerate the + loops induced by the block so that the buffer region consumed by the consumer block could + cover those regions produced by its producer blocks under the given loop. It requires: + + 1) `block` and `loop` are under the same scope, `loop` is not the ancestor of `block` + + 2) The scope block has stage-pipeline property + + 3) The subtree of the scope block, where the given block is in, satisfies the compact + dataflow condition. i.e. all the blocks in the scope block's subtree must be either + complete block or reduction block + + 4) All the producers of the block are under the given loop + + Parameters + ---------- + block : BlockRV + The block to be moved + + loop: LoopRV + The loop where the block to be moved under + + preserve_unit_loops: bool + Whether to keep the trivial loops whose extents are 1 + + Examples + -------- + + Before reverse-compute-at, in TensorIR, the IR is: + + .. code-block:: python + + @tvm.script.tir + def before_reverse_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + Create the schedule and do reverse-compute-at: + + .. 
code-block:: python + + sch = tir.Schedule(before_reverse_compute_at) + block = sch.get_block("C") + loop, _ = sch.get_loops(sch.get_block("B")) + sch.reverse_compute_at(block, loop, preserve_unit_loops=False) + print(tvm.script.asscript(sch.mod["main"])) + + After applying reverse-compute-at, the IR becomes: + + .. code-block:: python + + @tvm.script.tir + def after_reverse_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + for i in tir.serial(0, 128): + for j in tir.serial(0, 128): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j) + B[vi, vj] = A[vi, vj] * 2.0 + for j in tir.serial(0, 128): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j) + C[vi, vj] = B[vi, vj] + 1.0 + + """ + _ffi_api.ScheduleReverseComputeAt( # type: ignore # pylint: disable=no-member + self, + block, + loop, + preserve_unit_loops, + ) + def compute_inline(self, block: BlockRV) -> None: """Inline a block into its consumer(s). It requires: @@ -1189,10 +1366,15 @@ def after_rfactor(a: ty.handle, b: ty.handle) -> None: """ return _ffi_api.ScheduleRFactor(self, loop, factor_axis) # type: ignore # pylint: disable=no-member - ######## Schedule: Block annotatoin ######## + ######## Schedule: Block annotation ######## def storage_align( # pylint: disable=too-many-arguments - self, block: BlockRV, buffer_index: int, axis: int, factor: int, offset: int + self, + block: BlockRV, + buffer_index: int, + axis: int, + factor: int, + offset: int, ) -> None: """Set alignment requirement for specific dimension such that stride[axis] == k * factor + offset for some k. This is useful to set memory layout for more diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 9b843ae181fb..6b22cf13f5b9 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -61,6 +61,7 @@ from . import sparse from . import hls from . import random +from . import hexagon # error reporting from .utils import InvalidShapeError diff --git a/python/tvm/topi/cuda/nn.py b/python/tvm/topi/cuda/nn.py index 0de377705531..e29bb440de35 100644 --- a/python/tvm/topi/cuda/nn.py +++ b/python/tvm/topi/cuda/nn.py @@ -18,7 +18,9 @@ """scheduler functions for cuda backend""" from __future__ import absolute_import as _abs -from .. import cpp +import tvm +from tvm import te +from ..utils import traverse_inline def schedule_lrn(outs): @@ -35,4 +37,19 @@ def schedule_lrn(outs): sch: Schedule The computation schedule for the op. """ - return cpp.cuda.schedule_lrn(outs) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + + def _callback(op): + if "sqr_sum" in op.tag: + pad = op.input_tensors[0] + s[pad].compute_inline() + fused_axis = s[outs[0]].fuse(*s[outs[0]].op.axis) + bx, tx = s[outs[0]].split(fused_axis, factor=max_threads) + s[outs[0]].bind(bx, te.thread_axis("blockIdx.x")) + s[outs[0]].bind(tx, te.thread_axis("threadIdx.x")) + s[op].compute_at(s[outs[0]], tx) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/python/tvm/topi/cuda/pooling.py b/python/tvm/topi/cuda/pooling.py index f2a6aadb659f..ba2e7da8e11e 100644 --- a/python/tvm/topi/cuda/pooling.py +++ b/python/tvm/topi/cuda/pooling.py @@ -20,6 +20,8 @@ from tvm import te from .. 
import tag from ..utils import traverse_inline +from .reduction import _schedule_reduce +from .injective import schedule_injective_from_existing def schedule_adaptive_pool(outs, layout="NCHW"): @@ -39,12 +41,7 @@ def schedule_adaptive_pool(outs, layout="NCHW"): outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) - def _schedule(Pool): - num_thread = 8 - block_x = te.thread_axis("blockIdx.x") - block_y = te.thread_axis("blockIdx.y") - thread_x = te.thread_axis((0, num_thread), "threadIdx.x") - thread_y = te.thread_axis((0, num_thread), "threadIdx.y") + def _schedule_non_global(Pool): if Pool.op in s.outputs: Out = Pool OL = s.cache_write(Pool, "local") @@ -52,16 +49,12 @@ def _schedule(Pool): Out = outs[0].op.output(0) s[Pool].set_scope("local") - by, ty = s[Out].split(s[Out].op.axis[0], factor=num_thread) - if layout == "NHWC": - bx, tx = s[Out].split(s[Out].op.axis[3], factor=num_thread) - else: - bx, tx = s[Out].split(s[Out].op.axis[1], factor=num_thread) - s[Out].reorder(by, bx, ty, tx) - s[Out].bind(ty, thread_y) - s[Out].bind(tx, thread_x) - s[Out].bind(by, block_y) - s[Out].bind(bx, block_x) + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + fused_axis = s[Out].fuse(*s[Out].op.axis) + bx, tx = s[Out].split(fused_axis, factor=max_threads) + s[Out].bind(bx, te.thread_axis("blockIdx.x")) + s[Out].bind(tx, te.thread_axis("threadIdx.x")) + if Pool.op in s.outputs: s[OL].compute_at(s[Out], tx) else: @@ -72,7 +65,7 @@ def _schedule(Pool): def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): + if tag.is_injective(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: @@ -81,7 +74,16 @@ def traverse(OP): # schedule global_pool elif OP.tag.startswith("adaptive_pool"): Pool = OP.output(0) - _schedule(Pool) + oshape = Pool.shape + if (layout == "NCHW" and oshape[2] == 1 and oshape[3] == 1) or ( + layout == "NHWC" and oshape[1] == 1 and oshape[2] == 1 + ): + _schedule_reduce(OP, s) + if OP != outs[0].op: + # the final division for adaptive pool or fused elemwise ops + schedule_injective_from_existing(s, outs[0]) + else: + _schedule_non_global(Pool) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) @@ -135,7 +137,7 @@ def _schedule(PaddedInput, Pool): def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): + if tag.is_injective(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: diff --git a/python/tvm/topi/cuda/softmax.py b/python/tvm/topi/cuda/softmax.py index 516d4f93672e..14d2963acf98 100644 --- a/python/tvm/topi/cuda/softmax.py +++ b/python/tvm/topi/cuda/softmax.py @@ -21,43 +21,26 @@ from tvm.contrib import cudnn from .. import generic from .injective import schedule_injective_from_existing +from ..utils import traverse_inline -def schedule_softmax(outs): - """Schedule for softmax op. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of softmax in the format - of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. 
- """ - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - s = te.create_schedule([x.op for x in outs]) - softmax = outs[0] - tgt = Target.current(allow_none=False) - - op_tag = softmax.op.tag +def _schedule_softmax(softmax_op, s, outs, tgt): + op_tag = softmax_op.tag if op_tag == "softmax_output": - expsum = softmax.op.input_tensors[1] - exp = softmax.op.input_tensors[0] + expsum = softmax_op.input_tensors[1] + exp = softmax_op.input_tensors[0] max_elem = s[exp].op.input_tensors[1] delta = None elif op_tag == "fast_softmax_output": - expsum = softmax.op.input_tensors[1] - exp = softmax.op.input_tensors[0] + expsum = softmax_op.input_tensors[1] + exp = softmax_op.input_tensors[0] delta = s[exp].op.input_tensors[0] max_elem = s[delta].op.input_tensors[1] elif op_tag == "log_softmax_output": exp = None delta = None - max_elem = softmax.op.input_tensors[1] - expsum = softmax.op.input_tensors[2] + max_elem = softmax_op.input_tensors[1] + expsum = softmax_op.input_tensors[2] else: raise ValueError( "Tag is expected to be softmax_output or log_softmax_output. \ @@ -71,19 +54,23 @@ def schedule_softmax(outs): # # TODO(tvm-team) Fix nvptx codegen or deprecate nvptx backend. def sched_warp_softmax(): - if tgt.kind.name == "nvptx" or tgt.kind.name == "rocm": - return softmax.dtype == "float32" or softmax.dtype == "int32" + if tgt.kind.name in ["nvptx", "rocm"]: + dtype = softmax_op.output(0).dtype + return dtype in ["float32", "int32"] if tgt.kind.name != "cuda": - # this is used as the gpu schedule for other arches which may not have warp reductions + # this is used as the gpu schedule for other arches which + # may not have warp reductions return False return True - if len(softmax.shape) > 2: - ops = [max_elem.op, expsum.op, softmax.op] + if len(outs[0].shape) > 2: + ops = [max_elem.op, expsum.op, softmax_op] if delta is not None: ops.append(delta.op) if exp is not None: ops.append(exp.op) + if softmax_op != outs[0].op: + ops.append(outs[0].op) for op in ops: s = schedule_injective_from_existing(s, op.output(0)) @@ -95,17 +82,22 @@ def sched_warp_softmax(): thread_x = te.thread_axis((0, num_thread), "threadIdx.x") # (4) softmax - xo, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread) - _, xii = s[softmax].split(xi, factor=4) - s[softmax].vectorize(xii) - s[softmax].bind(xo, thread_x) - s[softmax].bind(softmax.op.axis[0], block_x) + output = outs[0] + xo, xi = s[output].split(output.op.axis[1], nparts=num_thread) + xio, xii = s[output].split(xi, factor=4) + s[output].vectorize(xii) + s[output].bind(xo, thread_x) + s[output].bind(output.op.axis[0], block_x) + + if softmax_op != outs[0].op: + s[softmax_op].compute_at(s[output], xio) + s[softmax_op].vectorize(softmax_op.axis[1]) # vec_len == 4 # (3) expsum k = expsum.op.reduce_axis[0] ko, _ = s[expsum].split(k, nparts=num_thread) s[expsum].bind(ko, thread_x) - s[expsum].compute_at(s[softmax], xo) + s[expsum].compute_at(s[output], xo) # (2) exp if delta is not None: @@ -117,7 +109,7 @@ def sched_warp_softmax(): s[exp].vectorize(xii) s[exp].bind(xo, thread_x) s[exp].compute_at(s[expsum], expsum.op.axis[0]) - s[exp].compute_at(s[softmax], softmax.op.axis[0]) + s[exp].compute_at(s[output], output.op.axis[0]) s[exp].set_scope("warp") # (1) max_elem @@ -149,10 +141,39 @@ def sched_warp_softmax(): s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x) s[EF].compute_at(s[expsum], s[expsum].op.reduce_axis[0]) s[expsum].set_store_predicate(thread_x.var.equal(0)) - tx, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread) - 
s[softmax].bind(softmax.op.axis[0], block_x) - s[softmax].bind(tx, thread_x) + output = outs[0] + tx, xi = s[output].split(output.op.axis[1], nparts=num_thread) + s[output].bind(output.op.axis[0], block_x) + s[output].bind(tx, thread_x) + + if softmax_op != outs[0].op: + s[softmax_op].compute_at(s[output], tx) + + +def schedule_softmax(outs): + """Schedule for softmax op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of softmax in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tgt = Target.current(allow_none=False) + + def _callback(op): + if "softmax" in op.tag: + _schedule_softmax(op, s, outs, tgt) + + traverse_inline(s, outs[0].op, _callback) return s diff --git a/python/tvm/topi/generic/__init__.py b/python/tvm/topi/generic/__init__.py index cc64abab8ed8..021f9a1bbe1d 100644 --- a/python/tvm/topi/generic/__init__.py +++ b/python/tvm/topi/generic/__init__.py @@ -39,3 +39,4 @@ from .sort import * from .search import * from .image import * +from .math import * diff --git a/python/tvm/topi/generic/math.py b/python/tvm/topi/generic/math.py new file mode 100644 index 000000000000..3af6cd16a374 --- /dev/null +++ b/python/tvm/topi/generic/math.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Generic math operators""" +from .default import default_schedule as _default_schedule + + +def schedule_einsum(outs): + """Schedule for einsum operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of einsum. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/python/tvm/topi/rocm/nn.py b/python/tvm/topi/hexagon/__init__.py similarity index 80% rename from python/tvm/topi/rocm/nn.py rename to python/tvm/topi/hexagon/__init__.py index c963375c636b..3263819ccf3a 100644 --- a/python/tvm/topi/rocm/nn.py +++ b/python/tvm/topi/hexagon/__init__.py @@ -14,11 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""scheduler for normalization functions on rocm backend""" -from __future__ import absolute_import as _abs -from .. import cpp +""" Schedules for Hexagon. 
""" +# pylint: disable=wildcard-import -def schedule_lrn(outs): - return cpp.rocm.schedule_lrn(outs) +from .conv2d import * diff --git a/python/tvm/topi/hexagon/conv2d.py b/python/tvm/topi/hexagon/conv2d.py new file mode 100644 index 000000000000..8a484ae77e48 --- /dev/null +++ b/python/tvm/topi/hexagon/conv2d.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Schedules for conv2d. """ + +import tvm + + +def schedule_conv2d_nhwc(outs): + """Schedule for Conv2d NHWC operator.""" + s = tvm.te.create_schedule([x.op for x in outs]) + return s diff --git a/python/tvm/topi/rocm/__init__.py b/python/tvm/topi/rocm/__init__.py index 1ea4c79aaea7..f61039ab91cc 100644 --- a/python/tvm/topi/rocm/__init__.py +++ b/python/tvm/topi/rocm/__init__.py @@ -22,4 +22,3 @@ from .batch_matmul import * from .conv2d import * from .dense import * -from .nn import * diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 32b06725cdc2..bda1f8c725f5 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -26,11 +26,11 @@ from ..generic import conv2d as conv2d_generic from ..utils import get_const_tuple, simplify from .tensor_intrin import dot_16x1x16_uint8_int8_int32 -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes def _fallback_schedule(cfg, wkl): - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 @@ -157,7 +157,7 @@ def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): kernel_vec, conv_out, last, - int32_lanes=16, + int32_lanes=get_simd_32bit_lanes(), intrin=dot_16x1x16_uint8_int8_int32(), ) diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 5e63de329bba..4f129fc6912f 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -22,11 +22,11 @@ from ..generic import conv2d as conv2d_generic from ..utils import get_const_tuple from .tensor_intrin import dot_16x1x16_uint8_int8_int32 -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes def _fallback_schedule(cfg, wkl): - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 @@ -174,6 +174,6 @@ def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): kernel_vec, conv_out, last, - int32_lanes=16, + int32_lanes=get_simd_32bit_lanes(), intrin=dot_16x1x16_uint8_int8_int32(), ) diff --git a/python/tvm/topi/x86/conv2d_int8.py 
b/python/tvm/topi/x86/conv2d_int8.py index ca0d0b8b223c..075723303841 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -30,6 +30,7 @@ from ..utils import get_const_tuple, traverse_inline from .. import nn from . import conv2d_avx_1x1, conv2d_avx_common +from .utils import target_has_sse42 def _get_default_config_int8( @@ -73,9 +74,7 @@ def is_int8_hw_support(data_dtype, kernel_dtype): # 3) Check target mcpu = tvm.target.Target.current().mcpu - is_target_support = False - if mcpu in ("skylake-avx512", "cascadelake"): - is_target_support = True + is_target_support = target_has_sse42(mcpu) return is_dtype_support and is_llvm_support and is_target_support diff --git a/python/tvm/topi/x86/conv3d.py b/python/tvm/topi/x86/conv3d.py index d5b09e640e16..c4194167ce47 100644 --- a/python/tvm/topi/x86/conv3d.py +++ b/python/tvm/topi/x86/conv3d.py @@ -26,7 +26,7 @@ from ..nn.utils import get_pad_tuple3d, infer_pad3d from ..nn.pad import pad from ..utils import get_const_tuple, simplify, get_const_int -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes Workload3D = namedtuple( "Workload", @@ -520,7 +520,7 @@ def _get_conv3d_workload(data, kernel, stride, padding, out_dtype, data_layout=" def _fallback_schedule(cfg, wkl): - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() DPAD, HPAD, WPAD = wkl.dpad, wkl.hpad, wkl.wpad DSTR, HSTR, WSTR = wkl.dstride, wkl.hstride, wkl.wstride out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 29c378dda30f..9799ec02d644 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -26,7 +26,7 @@ from tvm.contrib import mkl from tvm.contrib import mkldnn -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes from .. 
import generic, tag from ..utils import traverse_inline, get_const_tuple @@ -107,7 +107,7 @@ def _default_dense_pack_config(cfg, M, N, K): if isinstance(K, (tvm.tir.Var, tvm.tir.Any)): K = 16 - vec_width = get_fp32_len() + vec_width = get_simd_32bit_lanes() tilex_ii = 1 for bn in range(vec_width * 2, 0, -1): if N % bn == 0: @@ -145,7 +145,7 @@ def _default_dense_nopack_config(cfg, M, N, K): if isinstance(K, (tvm.tir.Var, tvm.tir.Any)): K = 16 - vec_width = get_fp32_len() + vec_width = get_simd_32bit_lanes() tilek_bn = 1 for bn in range(vec_width * 2, 0, -1): if K % bn == 0: diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index cb2f1929d395..8db84497f82d 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -47,7 +47,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): if cfg.is_fallback: _default_dense_pack_config(cfg, M, N, K) packw_bn = cfg["tile_x"].size[-1] - weight_layout = "NK%dn" % packw_bn + weight_layout = "NC%dn" % packw_bn new_weight = te.placeholder( (N // packw_bn, K, packw_bn), dtype=weight_tensor.dtype, diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index a0225ef9e147..5e49c2cb3b78 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -27,7 +27,7 @@ from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout from ..nn.conv2d import unpack_NCHWc_to_nchw from ..utils import traverse_inline -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes def _fallback_schedule(cfg, wkl): @@ -40,7 +40,7 @@ def _fallback_schedule(cfg, wkl): wkl : topi.nn.depthwise_conv2d.Workload Convolution workload """ - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.stride_h, wkl.stride_w diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 0e10052e2428..890a15898a1a 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -23,7 +23,7 @@ from tvm import te from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes from ..utils import get_const_tuple from ..nn.pad import pad from .. import tag @@ -62,7 +62,7 @@ def _get_default_config( def _fallback_schedule(cfg, wkl): - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() pad_left, pad_right = wkl.padl, wkl.padr stride_w = wkl.stride_w out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py index 4c39f2ad7382..9b6754c5e847 100644 --- a/python/tvm/topi/x86/nn.py +++ b/python/tvm/topi/x86/nn.py @@ -17,44 +17,28 @@ # pylint: disable=invalid-name,too-many-locals,unused-variable """x86 nn operators""" from tvm import te +from ..utils import traverse_inline -def schedule_softmax(outs): - """Schedule for softmax - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of softmax - in the format of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. 
- """ - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - softmax = outs[0] - s = te.create_schedule([x.op for x in outs]) - - op_tag = softmax.op.tag +def _schedule_softmax(softmax_op, s, outs): + op_tag = softmax_op.tag if op_tag == "softmax_output": - exp = softmax.op.input_tensors[0] - expsum = softmax.op.input_tensors[1] + exp = softmax_op.input_tensors[0] + expsum = softmax_op.input_tensors[1] max_elem = s[exp].op.input_tensors[1] delta = None - axis = int(softmax.op.attrs["axis"]) + axis = int(softmax_op.attrs["axis"]) elif op_tag == "fast_softmax_output": - exp = softmax.op.input_tensors[0] - expsum = softmax.op.input_tensors[1] + exp = softmax_op.input_tensors[0] + expsum = softmax_op.input_tensors[1] delta = s[exp].op.input_tensors[0] max_elem = s[delta].op.input_tensors[1] - axis = int(softmax.op.attrs["axis"]) + axis = int(softmax_op.attrs["axis"]) elif op_tag == "log_softmax_output": exp = None delta = None - max_elem = softmax.op.input_tensors[1] - expsum = softmax.op.input_tensors[2] + max_elem = softmax_op.input_tensors[1] + expsum = softmax_op.input_tensors[2] axis = 1 else: raise ValueError( @@ -65,18 +49,49 @@ def schedule_softmax(outs): ) # only parallelize outer dimensions up to axis - outer_axes = [s[softmax].op.axis[i] for i in range(0, axis)] - fused_outer_axes = s[softmax].fuse(*outer_axes) - s[softmax].parallel(fused_outer_axes) + outer_axes = [s[softmax_op].op.axis[i] for i in range(0, axis)] + fused_outer_axes = s[softmax_op].fuse(*outer_axes) + s[softmax_op].parallel(fused_outer_axes) # move computations with the same outer dimensions under the same root - s[max_elem].compute_at(s[softmax], fused_outer_axes) - s[expsum].compute_at(s[softmax], fused_outer_axes) + s[max_elem].compute_at(s[softmax_op], fused_outer_axes) + s[expsum].compute_at(s[softmax_op], fused_outer_axes) if delta is not None: s[exp].compute_inline() s[delta].compute_inline() if exp is not None: - s[exp].compute_at(s[softmax], fused_outer_axes) + s[exp].compute_at(s[softmax_op], fused_outer_axes) + + if softmax_op != outs[0].op: + # fuse softmax output with following elemwise ops. + output = outs[0] + outer_axes = [s[output].op.axis[i] for i in range(0, axis)] + fused_outer_axes = s[output].fuse(*outer_axes) + s[output].parallel(fused_outer_axes) + s[softmax_op].compute_at(s[output], fused_outer_axes) + + +def schedule_softmax(outs): + """Schedule for softmax + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of softmax + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "softmax" in op.tag: + _schedule_softmax(op, s, outs) + traverse_inline(s, outs[0].op, _callback) return s diff --git a/python/tvm/topi/x86/pooling.py b/python/tvm/topi/x86/pooling.py index db0f9faf1970..b3f4eedec67c 100644 --- a/python/tvm/topi/x86/pooling.py +++ b/python/tvm/topi/x86/pooling.py @@ -89,7 +89,7 @@ def _schedule(PaddedInput, Pool): def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): + if tag.is_injective(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: @@ -137,7 +137,7 @@ def schedule_adaptive_pool(outs): def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): + if tag.is_injective(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: diff --git a/python/tvm/topi/x86/sparse.py b/python/tvm/topi/x86/sparse.py index 48ec233fa4bb..8a2cb0b69475 100644 --- a/python/tvm/topi/x86/sparse.py +++ b/python/tvm/topi/x86/sparse.py @@ -21,7 +21,7 @@ from ..transform import reshape from ..utils import traverse_inline, get_const_int -from .utils import get_fp32_len +from .utils import get_simd_32bit_lanes def schedule_sparse_dense(outs): @@ -29,7 +29,7 @@ def schedule_sparse_dense(outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - simd_width = get_fp32_len() + simd_width = get_simd_32bit_lanes() if op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_lhs_csrmm": (y_o, y_i) = s[op].split(s[op].op.axis[1], 2) fused = s[op].fuse(s[op].op.axis[0], y_o) diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index 818765dc0b27..727319c95c5c 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -19,20 +19,19 @@ import tvm from tvm import te import tvm.target.codegen +from .utils import target_has_sse42, target_has_vnni, get_simd_32bit_lanes def dot_16x1x16_uint8_int8_int32(): """Dispatch the most optimized intrin depending on the target""" mcpu = tvm.target.Target.current().mcpu - assert mcpu in ( - "skylake-avx512", - "cascadelake", - ), "An old Intel machine that does not have fast Int8 support." - if mcpu == "skylake-avx512": - return dot_16x1x16_uint8_int8_int32_skylake() - # cascadelake - return dot_16x1x16_uint8_int8_int32_cascadelake() + assert target_has_sse42(mcpu), "An old Intel machine that does not have fast Int8 support." 
+ if target_has_vnni(mcpu): + # VNNI capable platform + return dot_16x1x16_uint8_int8_int32_cascadelake() + # vpmaddubsw/vpmaddwd fallback + return dot_16x1x16_uint8_int8_int32_skylake() def dot_16x1x16_uint8_int8_int32_skylake(): @@ -64,7 +63,7 @@ def dot_16x1x16_uint8_int8_int32_skylake(): The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ - int32_lanes = 16 # 16 int32 lanes in AVX512 + int32_lanes = get_simd_32bit_lanes() num_int8_elements = 4 # 4 int8 elements in int32 data = te.placeholder((num_int8_elements,), dtype="uint8", name="data") kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel") @@ -84,27 +83,50 @@ def dot_16x1x16_uint8_int8_int32_skylake(): def _intrin_func(ins, outs): def _instr(index): + # int_lx32 - output datatype after pmaddubs - 16 bits to number of lanes + # int_8xl - input datatype to pmaddubs - 8 bits to number of lanes + # int_32xl - output datatype after pmaddw - 32 bits per number of lanes + + if int32_lanes == 4: + int_lx32 = "int16x8" + int_8xl = "int8x16" + int_32xl = "int32x4" + pmaddubs = "llvm.x86.ssse3.pmadd.ub.sw.128" + pmaddw = "llvm.x86.sse2.pmadd.wd" + elif int32_lanes == 8: + int_lx32 = "int16x16" + int_8xl = "int8x32" + int_32xl = "int32x8" + pmaddubs = "llvm.x86.avx2.pmadd.ub.sw" + pmaddw = "llvm.x86.avx2.pmadd.wd" + elif int32_lanes == 16: + int_lx32 = "int16x32" + int_8xl = "int8x64" + int_32xl = "int32x16" + pmaddubs = "llvm.x86.avx512.pmaddubs.w.512" + pmaddw = "llvm.x86.avx512.pmaddw.d.512" + ib = tvm.tir.ir_builder.create() if index == 1: - ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16"))) + ib.emit(outs[0].vstore(0, tvm.tir.const(0, int_32xl))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8) - vec_ai32 = re_int32.astype("int32x16") - vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32) - vec_b = ins[1].vload([0, 0], "int8x64") - vec_one = tvm.tir.const(1, "int16x32") + vec_ai32 = re_int32.astype(int_32xl) + vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32) + vec_b = ins[1].vload([0, 0], int_8xl) + vec_one = tvm.tir.const(1, int_lx32) pair_reduction = tvm.tir.call_llvm_pure_intrin( - "int16x32", - "llvm.x86.avx512.pmaddubs.w.512", + int_lx32, + pmaddubs, tvm.tir.const(0, "uint32"), vec_a, vec_b, ) quad_reduction = tvm.tir.call_llvm_pure_intrin( - "int32x16", - "llvm.x86.avx512.pmaddw.d.512", + int_32xl, + pmaddw, tvm.tir.const(0, "uint32"), pair_reduction, vec_one, @@ -112,7 +134,7 @@ def _instr(index): if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: - ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16"))) + ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], int_32xl))) return ib.get() # body, reset, update diff --git a/python/tvm/topi/x86/utils.py b/python/tvm/topi/x86/utils.py index 92c11a7f1ef1..658a92966257 100644 --- a/python/tvm/topi/x86/utils.py +++ b/python/tvm/topi/x86/utils.py @@ -18,9 +18,95 @@ import tvm -def get_fp32_len(): +def target_has_sse42(target): + return ( + target_has_avx(target) + or target_has_avx2(target) + or target_has_avx512(target) + or target_has_vnni(target) + or target + in { + "silvermont", + "slm", + "goldmont", + "goldmont-plus", + "tremont", + "nehalem", + "corei7", + "westmere", + "bdver1", + "bdver2", + "bdver3", + "x86-64-v2", + } + ) + + +def target_has_avx(target): + return ( + target_has_avx2(target) + or target_has_avx512(target) + or target_has_vnni(target) + or target in 
{"sandybridge", "corei7-avx", "ivybridge", "core-avx-i"} + ) + + +def target_has_avx2(target): + return ( + target_has_avx512(target) + or target_has_vnni(target) + or target + in { + "haswell", + "core-avx2", + "broadwell", + "skylake", + "bdver4", + "znver1", + "znver2", + "znver3", + "x86-64-v3", + } + ) + + +def target_has_avx512(target): + return target in { + "skylake-avx512", + "skx", + "knl", + "knm", + "x86-64-v4", + "cannonlake", + # explicit enumeration of VNNI capable due to collision with alderlake + "cascadelake", + "icelake-client", + "rocketlake", + "icelake", + "tigerlake", + "cooperlake", + "sapphirerapids", + } + + +def target_has_vnni(target): + return target in { + "cascadelake", + "icelake-client", + "rocketlake", + "icelake", + "tigerlake", + "cooperlake", + "sapphirerapids", + "alderlake", + } + + +def get_simd_32bit_lanes(): mcpu = tvm.target.Target.current().mcpu - fp32_vec_len = 8 - if mcpu in ("skylake-avx512", "cascadelake"): + fp32_vec_len = 4 + if target_has_avx512(mcpu): fp32_vec_len = 16 + elif target_has_avx2(mcpu): + fp32_vec_len = 8 return fp32_vec_len diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index 7000de96dc99..a402212cf4ea 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -607,6 +607,13 @@ inline bool ProveEqual(Analyzer* analyzer, PrimExpr lhs, PrimExpr rhs) { return is_zero(analyzer->Simplify(lhs - rhs)); } +IntSet IntSet::FromMinExtent(PrimExpr min, PrimExpr extent) { + if (is_one(extent)) { + return IntSet::SinglePoint(min); + } + return IntervalSet(min, extent + min - 1); +} + IntSet IntSet::FromRange(Range r) { // must make sure it can be matched back by MatchRange. if (is_one(r->extent)) { @@ -815,19 +822,18 @@ IntSet EvalSet(Range r, const Map& dom_map) { return EvalSet(r, ConvertDomMap(dom_map)); } -Optional> EstimateRegionLowerBound(const Array& region, - const Map& var_dom, - const PrimExpr& predicate, - arith::Analyzer* analyzer) { +Optional> EstimateRegionLowerBound(const Array& region, + const Map& var_dom, + const PrimExpr& predicate, Analyzer* analyzer) { int ndim = region.size(); - Array iter_sum_exprs{nullptr}; + Array iter_sum_exprs{nullptr}; { Array affine_indices; affine_indices.reserve(ndim); for (const Range& range : region) { affine_indices.push_back(range->min); } - iter_sum_exprs = arith::DetectIterMap( + iter_sum_exprs = DetectIterMap( /*indices=*/affine_indices, /*input_iters=*/var_dom, /*predicate=*/predicate, /*require_bijective=*/false, analyzer); } @@ -835,17 +841,17 @@ Optional> EstimateRegionLowerBound(const Array& regi return NullOpt; } ICHECK_EQ(iter_sum_exprs.size(), ndim); - Array result; + Array result; result.reserve(ndim); for (int i = 0; i < ndim; ++i) { - const arith::IterSumExpr& sum_expr = iter_sum_exprs[i]; + const IterSumExpr& sum_expr = iter_sum_exprs[i]; const Range& range = region[i]; if (sum_expr->args.empty()) { - result.push_back(arith::IntSet::Interval(sum_expr->base, sum_expr->base + range->extent)); + result.push_back(IntSet::FromMinExtent(sum_expr->base, range->extent)); continue; } ICHECK_EQ(sum_expr->args.size(), 1); - const arith::IterSplitExpr& split = sum_expr->args[0]; + const IterSplitExpr& split = sum_expr->args[0]; if (!analyzer->CanProve(range->extent >= split->scale)) { return NullOpt; } @@ -853,8 +859,8 @@ Optional> EstimateRegionLowerBound(const Array& regi // IterSplitExpr: (source // lower_factor) % extent * scale // where `(source // lower_factor) % extent` is within [0, extent - 1] // Therefore, the range of `region[i]->min` is `base + [0, (extent - 
1) * scale]` - result.push_back(arith::IntSet::Interval( - base, split->extent * split->scale + base + (range->extent - split->scale) - 1)); + result.push_back( + IntSet::FromMinExtent(base, split->extent * split->scale + (range->extent - split->scale))); } return result; } diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index 1d3475b13dad..4a99e10211b7 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -474,6 +474,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) { if ((div(ramp(b1, c1, lanes), broadcast(c2, lanes))).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; + ICHECK(c2val != 0) << "division by zero"; if (c1val % c2val == 0) { return ramp(div(b1, c2), div(c1, c2), lanes).Eval(); } @@ -644,6 +645,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) { if (truncmod(ramp(b1, c1, lanes), broadcast(c2, lanes)).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; + ICHECK(c2val != 0) << "division by zero"; if (c1val % c2val == 0) { return broadcast(truncmod(b1, c2), lanes).Eval(); } @@ -723,6 +725,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { if (floordiv(ramp(b1, c1, lanes), broadcast(c2, lanes)).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; + ICHECK(c2val != 0) << "division by zero"; if (c1val % c2val == 0) { return ramp(floordiv(b1, c2), floordiv(c1, c2), lanes).Eval(); } @@ -851,6 +854,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { if (floormod(ramp(b1, c1, lanes), broadcast(c2, lanes)).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; + ICHECK(c2val != 0) << "division by zero"; if (c1val % c2val == 0) { return broadcast(floormod(b1, c2), lanes).Eval(); } diff --git a/src/ir/affine_type.cc b/src/ir/affine_type.cc index 3454b6011c9b..87235fe20ade 100644 --- a/src/ir/affine_type.cc +++ b/src/ir/affine_type.cc @@ -30,26 +30,28 @@ namespace tvm { using tvm::ReprPrinter; using namespace tvm::runtime; -TensorAffineType::TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype) { +TensorAffineType::TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype, + int axis) { ObjectPtr n = make_object(); n->scale = std::move(scale); n->zero_point = std::move(zero_point); n->dtype = std::move(dtype); + n->axis = std::move(axis); data_ = std::move(n); } TVM_REGISTER_NODE_TYPE(TensorAffineTypeNode); TVM_REGISTER_GLOBAL("ir.TensorAffineType") - .set_body_typed([](RelayExpr scale, RelayExpr zero_point, DataType dtype) { - return TensorAffineType(scale, zero_point, dtype); + .set_body_typed([](RelayExpr scale, RelayExpr zero_point, DataType dtype, int axis) { + return TensorAffineType(scale, zero_point, dtype, axis); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { auto* node = static_cast(ref.get()); p->stream << "TensorAffineType(" << node->scale << ", " << node->zero_point << ", " - << node->dtype << ")"; + << node->dtype << ", " << node->axis << ")"; }); TupleAffineType::TupleAffineType(Array types) { diff --git a/src/ir/module.cc b/src/ir/module.cc index d4129c84ccf5..15c441d61a23 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -43,7 +43,8 @@ namespace tvm { IRModule::IRModule(tvm::Map functions, tvm::Map type_definitions, - std::unordered_set import_set, parser::SourceMap source_map) { + std::unordered_set import_set, 
parser::SourceMap source_map,
+                   DictAttrs attrs) {
   auto n = make_object<IRModuleNode>();
   n->functions = std::move(functions);
   n->type_definitions = std::move(type_definitions);
@@ -52,6 +53,7 @@ IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,
   n->constructor_tag_map_ = {};
   n->import_set_ = std::move(import_set);
   n->source_map = source_map;
+  n->attrs = std::move(attrs);

   for (const auto& kv : n->functions) {
     // set global var map
@@ -72,6 +74,7 @@ IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,

 bool IRModuleNode::SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const {
   if (functions.size() != other->functions.size()) return false;
+  if (!equal(this->attrs, other->attrs)) return false;
   for (const auto& kv : this->functions) {
     if (!other->ContainGlobalVar(kv.first->name_hint)) return false;
     if (!equal(kv.second, other->Lookup(kv.first->name_hint))) return false;
@@ -112,6 +115,7 @@ void IRModuleNode::SHashReduce(SHashReducer hash_reduce) const {
     temp.emplace_back(kv.first->name_hint, kv.second);
   }
   reduce_temp();
+  hash_reduce(this->attrs);
 }

 bool IRModuleNode::ContainGlobalVar(const String& name) const {
@@ -361,6 +365,11 @@ void IRModuleNode::Update(const IRModule& mod) {
   }
 }

+IRModule IRModuleNode::ShallowCopy() {
+  return IRModule(this->functions, this->type_definitions, this->Imports(), this->source_map,
+                  this->attrs);
+}
+
 std::pair<IRModule, GlobalVar> IRModule::FromExprInContext(
     const RelayExpr& expr, const tvm::Map<GlobalVar, BaseFunc>& global_funcs,
     const tvm::Map<GlobalTypeVar, TypeData>& type_definitions,
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index df02a6906a0c..906dc258560a 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -45,15 +45,47 @@
 namespace tvm {
 namespace tir {

+enum class ExprPrecedence : int {
+  /*! \brief Identity(e.g., IntImm, Var) and function call(e.g., floordiv, min) */
+  kIdentity = 0,
+  /*!
+   * \brief Multiplication(*), division(/), and remainder(%)
+   * \note floorDiv and floorMod are marked as kIdentity since they are function calls.
+   */
+  kMultiplicationDivision = 1,
+  /*! \brief Addition(+) and subtraction(-) */
+  kAdditionSubtraction = 2,
+  /*! \brief Relational operators <, <=, > and >= */
+  kRelational = 3,
+  /*! \brief Equality operators == and != */
+  kEquality = 4,
+  /*! \brief And(&&) */
+  kAnd = 5,
+  /*! \brief Or(||) */
+  kOr = 6,
+  /*! \brief Unknown precedence */
+  kUnknown = 7,
+};
+
+/*!
+ * \brief The printer for TVMScript
+ * \details The printer obtains the precedence of the top-level operation when printing each
+ *          subexpression, and uses it to decide whether parentheses are needed.
+ */
 class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
-                         public ExprFunctor<Doc(const PrimExpr&)>,
+                         public ExprFunctor<Doc(const PrimExpr&, ExprPrecedence*)>,
                          public TypeFunctor<Doc(const Type&)> {
  public:
   explicit TVMScriptPrinter(bool show_meta,
                             runtime::TypedPackedFunc<std::string(Stmt)> annotate = nullptr)
       : show_meta_(show_meta), annotate_(std::move(annotate)), meta_collector_(&meta_) {}

-  /*! \brief Print the node */
+  /*!
+   * \brief Print the node.
+   * \param node The node to be printed.
+   * \param out_precedence The operator precedence of node if it's a PrimExpr,
+   *        so that unnecessary parentheses can be omitted.
+   */
   TVM_DLL Doc Print(const ObjectRef& node);

  private:
@@ -68,12 +100,12 @@ class TVMScriptPrinter : public StmtFunctor<Doc(const Stmt&)>,
   /*! \brief map from Function to GlobalVar */
   std::unordered_map<const BaseFuncNode*, GlobalVar> func2var_;
   /*! \brief var collector (var defined by For/Loop/Block) */
-  std::unordered_set<const VarNode*> var_not_in_headers;
+  std::unordered_set<const VarNode*> var_not_in_headers_;
   /*!
* \brief buffer collector * (buffer defined in BufferMap, BufferAllocation and MatchBufferRegion) */ - std::unordered_set buf_not_in_headers; + std::unordered_set buf_not_in_headers_; /*! \brief Map from Var to thread env name */ std::unordered_map var_env_map_; /*! \brief Map from Var to Doc */ @@ -93,40 +125,40 @@ class TVMScriptPrinter : public StmtFunctor, /*! \brief loop stack without annotations */ std::vector loop_stack_; - Doc VisitExpr_(const CastNode* op) override; - Doc VisitExpr_(const VarNode* op) override; - Doc VisitExpr_(const AddNode* op) override; - Doc VisitExpr_(const SubNode* op) override; - Doc VisitExpr_(const MulNode* op) override; - Doc VisitExpr_(const DivNode* op) override; - Doc VisitExpr_(const ModNode* op) override; - Doc VisitExpr_(const FloorDivNode* op) override; - Doc VisitExpr_(const FloorModNode* op) override; - Doc VisitExpr_(const MinNode* op) override; - Doc VisitExpr_(const MaxNode* op) override; - Doc VisitExpr_(const EQNode* op) override; - Doc VisitExpr_(const NENode* op) override; - Doc VisitExpr_(const LTNode* op) override; - Doc VisitExpr_(const LENode* op) override; - Doc VisitExpr_(const GTNode* op) override; - Doc VisitExpr_(const GENode* op) override; - Doc VisitExpr_(const AndNode* op) override; - Doc VisitExpr_(const OrNode* op) override; - Doc VisitExpr_(const NotNode* op) override; - Doc VisitExpr_(const SelectNode* op) override; - Doc VisitExpr_(const IntImmNode* op) override; - Doc VisitExpr_(const FloatImmNode* op) override; - Doc VisitExpr_(const StringImmNode* op) override; - Doc VisitExpr_(const ProducerLoadNode* op) override; - Doc VisitExpr_(const BufferLoadNode* op) override; - Doc VisitExpr_(const LoadNode* op) override; - Doc VisitExpr_(const RampNode* op) override; - Doc VisitExpr_(const BroadcastNode* op) override; - Doc VisitExpr_(const LetNode* op) override; - Doc VisitExpr_(const CallNode* op) override; - Doc VisitExpr_(const ShuffleNode* op) override; - Doc VisitExpr_(const ReduceNode* op) override; - Doc VisitExprDefault_(const Object* op) override; + Doc VisitExpr_(const CastNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const VarNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const AddNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const SubNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const MulNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const DivNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const ModNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const FloorDivNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const FloorModNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const MinNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const MaxNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const EQNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const NENode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const LTNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const LENode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const GTNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const GENode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const AndNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const OrNode* op, ExprPrecedence* out_precedence) override; + Doc 
VisitExpr_(const NotNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const SelectNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const IntImmNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const FloatImmNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const StringImmNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const ProducerLoadNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const BufferLoadNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const LoadNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const RampNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const BroadcastNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const LetNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const CallNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const ShuffleNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const ReduceNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExprDefault_(const Object* op, ExprPrecedence* out_precedence) override; Doc VisitStmt_(const LetStmtNode* op) override; Doc VisitStmt_(const AttrStmtNode* op) override; @@ -138,6 +170,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitStmt_(const IfThenElseNode* op) override; Doc VisitStmt_(const SeqStmtNode* op) override; Doc VisitStmt_(const ForNode* op) override; + Doc VisitStmt_(const WhileNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; Doc VisitStmt_(const EvaluateNode* op) override; Doc VisitStmt_(const BlockRealizeNode* op) override; @@ -155,6 +188,9 @@ class TVMScriptPrinter : public StmtFunctor, Doc PrintArray(const ArrayNode* op); Doc PrintBuffer(const BufferNode* op); Doc AllocBufferDeclaration(const Buffer& buf); + Doc PrintBlockVar(const BlockNode* op); + Doc PrintBlockAttr(const BlockRealizeNode* op); + Doc PrintBlockBody(const BlockNode* op); Doc PrintBufferRegion(const BufferRegionNode* op); Doc PrintMatchBufferRegion(const MatchBufferRegionNode* op); Doc PrintAnnotations(const Map& annotations); @@ -163,6 +199,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc GetUniqueName(std::string prefix); Doc AllocVar(const Var& var); Doc AllocBuf(const Buffer& buffer); + void TryDeallocVar(const Var& var); /*! Helper functions for loop printing. */ /*! 
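Before the next hunk, note the naming machinery it touches: GetUniqueName hands out suffixed names from a per-prefix counter, and the new TryDeallocVar rolls that counter back so suffixes can be reused. A toy Python model of those counter semantics (an illustrative sketch, not TVM code):

name_alloc = {}  # per-prefix suffix counter, like name_alloc_map_

def get_unique_name(prefix):
    if prefix not in name_alloc:
        name_alloc[prefix] = -1  # -1: even the bare prefix is free
    if name_alloc[prefix] < 0:
        name_alloc[prefix] += 1  # claim the bare prefix itself
        return prefix
    name_alloc[prefix] += 1
    return f"{prefix}_{name_alloc[prefix]}"

def try_dealloc_name(prefix, printed):
    # Roll back only if `printed` was the most recent allocation,
    # mirroring the name_hint != print_name early-return below.
    suffix = name_alloc.get(prefix)
    if suffix is None:
        return
    latest = prefix if suffix <= 0 else f"{prefix}_{suffix}"
    if printed == latest:
        name_alloc[prefix] -= 1

print(get_unique_name("i"))   # i
print(get_unique_name("i"))   # i_1
try_dealloc_name("i", "i_1")  # frees the suffix
print(get_unique_name("i"))   # i_1 again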
@@ -254,7 +291,7 @@ Doc TVMScriptPrinter::GetUniqueName(std::string prefix) { std::replace(prefix.begin(), prefix.end(), '.', '_'); std::string unique_prefix = prefix; auto it = name_alloc_map_.find(prefix); - if (it != name_alloc_map_.end()) { + if (it != name_alloc_map_.end() && it->second >= 0) { while (name_alloc_map_.count(unique_prefix = prefix + "_" + std::to_string(++it->second)) > 0) { } } @@ -278,38 +315,41 @@ Doc TVMScriptPrinter::AllocVar(const Var& var) { Doc TVMScriptPrinter::AllocBufferDeclaration(const Buffer& buf) { Doc doc = Print(buf->shape); - if (!runtime::TypeEqual(buf->dtype, DataType::Float(32))) { - doc << ", dtype=" << PrintDType(buf->dtype); - } + bool print_factor_explicitly = false; + doc << ", dtype=" << PrintDType(buf->dtype); if (memo_var_.find(buf->data) != memo_var_.end()) { doc << ", data=" << Print(buf->data); } else { // implicitly define data memo_var_[buf->data] = Doc::Text(memo_buf_[buf].str() + ".data"); - var_not_in_headers.insert(buf->data.get()); + var_not_in_headers_.insert(buf->data.get()); } if (!buf->strides.empty()) { doc << ", strides=" << Print(buf->strides); } - if (buf->offset_factor != 0 && buf->elem_offset->IsInstance()) { + if (buf->elem_offset->IsInstance()) { Var elem_offset = Downcast(buf->elem_offset); if (memo_var_.find(elem_offset) != memo_var_.end()) { doc << ", elem_offset=" << Print(buf->elem_offset); } else { // implicitly define elem_offset memo_var_[elem_offset] = Doc::Text(memo_buf_[buf].str() + ".elem_offset"); - var_not_in_headers.insert(elem_offset.get()); + var_not_in_headers_.insert(elem_offset.get()); + print_factor_explicitly = true; + } + } else if (buf->elem_offset->IsInstance()) { + IntImm elem_offset = Downcast(buf->elem_offset); + if (elem_offset->value != 0) { + doc << ", elem_offset=" << Print(buf->elem_offset); } - } else { - doc << ", elem_offset=" << Print(buf->elem_offset); } if (buf.scope() != "global") { doc << ", scope=" << Doc::StrLiteral(buf.scope()); } - if (buf->data_alignment != -1) { + if (buf->data_alignment != runtime::kAllocAlignment) { doc << ", align=" << buf->data_alignment; } - if (buf->offset_factor != 0) { + if (buf->offset_factor != 1 || print_factor_explicitly) { doc << ", offset_factor=" << buf->offset_factor; } if (buf->buffer_type != 1) { @@ -333,9 +373,36 @@ Doc TVMScriptPrinter::AllocBuf(const Buffer& buffer) { return val; } +/*! + * \brief Try to deallocate vars that have gone out of scope, releasing their name indices for vars to come. + * \note It is not a necessary step. + */ +void TVMScriptPrinter::TryDeallocVar(const Var& var) { + auto it = memo_var_.find(var); + ICHECK(it != memo_var_.end()); + std::string print_name = it->second.str(); + + std::string name_hint = var->name_hint.operator std::string(); + if (name_hint.length() == 0 || !std::isalpha(name_hint[0])) { + name_hint = "v" + name_hint; + } + std::replace(name_hint.begin(), name_hint.end(), '.', '_'); + + auto it2 = name_alloc_map_.find(name_hint); + // Skip it if we cannot find the name_hint in name_alloc_map_. + if (it2 == name_alloc_map_.end()) return; + if (it2->second > 0) { + name_hint = name_hint + '_' + std::to_string(it2->second); + } + // Skip it if the name_hint is not equal to how it should be printed.
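// For illustration (hypothetical input, not part of this patch): two sibling serial loops
// whose loop vars are both hinted "i" would otherwise print as "i" and "i_1"; decrementing
// the counter below once the first loop has been fully printed lets the second reuse "i".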
+ if (name_hint != print_name) return; + // Free the corresponding name_alloc_map_ index + --it2->second; +} + Doc TVMScriptPrinter::PrintMatchBufferRegion(const MatchBufferRegionNode* op) { const Buffer& buf = op->buffer; - buf_not_in_headers.insert(buf.get()); + buf_not_in_headers_.insert(buf.get()); Doc doc = Print(op->buffer) << " = tir.match_buffer(" << Print(op->source) << ", " << memo_buf_decl_[op->buffer] << ")"; @@ -347,7 +414,8 @@ Doc TVMScriptPrinter::Print(const ObjectRef& node) { if (node->IsInstance()) { return PrintOptionalInfo(Downcast(node)) << VisitStmt(Downcast(node)); } else if (node->IsInstance()) { - return VisitExpr(Downcast(node)); + ExprPrecedence t = ExprPrecedence::kUnknown; + return VisitExpr(Downcast(node), &t); } else if (node->IsInstance()) { return VisitType(Downcast(node)); } else if (node->IsInstance()) { @@ -374,7 +442,7 @@ Doc TVMScriptPrinter::Print(const ObjectRef& node) { } } -Doc TVMScriptPrinter::VisitExprDefault_(const Object* op) { +Doc TVMScriptPrinter::VisitExprDefault_(const Object* op, ExprPrecedence* out_precedence) { LOG(FATAL) << "Do not know how to print " << op->GetTypeKey(); return Doc(); } @@ -384,92 +452,125 @@ Doc TVMScriptPrinter::VisitStmtDefault_(const Object* op) { return Doc(); } -Doc TVMScriptPrinter::VisitExpr_(const IntImmNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const IntImmNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; return PrintConstScalar(op->dtype, &(op->value)); } -Doc TVMScriptPrinter::VisitExpr_(const FloatImmNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const FloatImmNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; return PrintConstScalar(op->dtype, &(op->value)); } -Doc TVMScriptPrinter::VisitExpr_(const StringImmNode* op) { return Doc::StrLiteral(op->value); } +Doc TVMScriptPrinter::VisitExpr_(const StringImmNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; + return Doc::StrLiteral(op->value); +} -Doc TVMScriptPrinter::VisitExpr_(const CastNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const CastNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.cast(" << Print(op->value) << ", " << PrintDType(op->dtype) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const VarNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const VarNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; const Var& var = GetRef(op); return meta_.InMeta(var) ?
meta_.GetMetaNode(var) : AllocVar(GetRef(op)); } -#define TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OpName, OpString) \ - Doc TVMScriptPrinter::VisitExpr_(const OpName* op) { \ - Doc doc; \ - doc << '(' << Print(op->a) << OpString << Print(op->b) << ")"; \ - return doc; \ - } - -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AddNode, " + ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(SubNode, " - ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(MulNode, "*") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(DivNode, " / ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(ModNode, " % ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(EQNode, " == ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(NENode, " != ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LTNode, " < ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LENode, " <= ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GTNode, " > ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GENode, " >= ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AndNode, " and ") -TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OrNode, " or ") - -Doc TVMScriptPrinter::VisitExpr_(const FloorDivNode* op) { +#define TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OpName, OpString, OpPrecedence) \ + Doc TVMScriptPrinter::VisitExpr_(const OpName* op, ExprPrecedence* out_precedence) { \ + Doc doc; \ + ExprPrecedence lhs_precedence = ExprPrecedence::kUnknown; \ + ExprPrecedence rhs_precedence = ExprPrecedence::kUnknown; \ + /* Get children expr out_precedence */ \ + Doc lhs_doc = VisitExpr(op->a, &lhs_precedence); \ + Doc rhs_doc = VisitExpr(op->b, &rhs_precedence); \ + ICHECK(lhs_precedence != ExprPrecedence::kUnknown); \ + ICHECK(rhs_precedence != ExprPrecedence::kUnknown); \ + /* Update out_precedence of current node. */ \ + *out_precedence = OpPrecedence; \ + if (lhs_precedence > OpPrecedence) { \ + doc << "(" << lhs_doc << ")"; \ + } else { \ + doc << lhs_doc; \ + } \ + doc << OpString; \ + if (rhs_precedence >= OpPrecedence) { \ + doc << "(" << rhs_doc << ")"; \ + } else { \ + doc << rhs_doc; \ + } \ + return doc; \ + } + +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(MulNode, "*", ExprPrecedence::kMultiplicationDivision) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(DivNode, " / ", ExprPrecedence::kMultiplicationDivision) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(ModNode, " % ", ExprPrecedence::kMultiplicationDivision) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AddNode, " + ", ExprPrecedence::kAdditionSubtraction) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(SubNode, " - ", ExprPrecedence::kAdditionSubtraction) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LTNode, " < ", ExprPrecedence::kRelational) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(LENode, " <= ", ExprPrecedence::kRelational) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GTNode, " > ", ExprPrecedence::kRelational) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(GENode, " >= ", ExprPrecedence::kRelational) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(EQNode, " == ", ExprPrecedence::kEquality) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(NENode, " != ", ExprPrecedence::kEquality) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(AndNode, " and ", ExprPrecedence::kAnd) +TVM_DECLARE_TVMSCRIPT_PRINTER_BINOP(OrNode, " or ", ExprPrecedence::kOr) + +Doc TVMScriptPrinter::VisitExpr_(const FloorDivNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.floordiv(" << Print(op->a) << ", " << Print(op->b) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const FloorModNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const FloorModNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.floormod(" << Print(op->a) << ", " << Print(op->b) << 
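// Worked example (illustrative, not from the patch): for Mul(Add(a, b), c) the lhs reports
// kAdditionSubtraction, which binds looser than kMultiplicationDivision, so it is wrapped and
// prints as "(a + b)*c", while the rhs "c" (kIdentity) needs no parentheses. Because the rhs
// check uses ">=" rather than ">", equal precedence on the right still forces parentheses:
// Sub(a, Sub(b, c)) prints as "a - (b - c)" while Sub(Sub(a, b), c) prints as "a - b - c",
// preserving left-associativity.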
")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const MinNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const MinNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.min(" << Print(op->a) << ", " << Print(op->b) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const MaxNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const MaxNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.max(" << Print(op->a) << ", " << Print(op->b) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const NotNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const NotNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; - doc << "not (" << Print(op->a) << ")"; + doc << "not(" << Print(op->a) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const SelectNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const SelectNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.Select(" << Print(op->condition) << ", " << Print(op->true_value) << ", " << Print(op->false_value) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const ProducerLoadNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const ProducerLoadNode* op, ExprPrecedence* out_precedence) { LOG(FATAL) << "Cannot print a tir.ProducerLoad as it is not valid in TIR Primfuncs. You need to " "lower this function first."; return Doc(); } -Doc TVMScriptPrinter::VisitExpr_(const BufferLoadNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const BufferLoadNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; if (op->indices.size() == 0) { doc << Print(op->buffer) << "[()]"; @@ -479,7 +580,8 @@ Doc TVMScriptPrinter::VisitExpr_(const BufferLoadNode* op) { return doc; } -Doc TVMScriptPrinter::VisitExpr_(const LoadNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const LoadNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; if (op->dtype == DataType::Float(32) && is_one(op->predicate) && op->buffer_var->dtype == DataType::Float(32)) { @@ -495,25 +597,29 @@ Doc TVMScriptPrinter::VisitExpr_(const LoadNode* op) { return doc; } -Doc TVMScriptPrinter::VisitExpr_(const RampNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const RampNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.ramp(" << Print(op->base) << ", " << Print(op->stride) << ", " << op->lanes << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const BroadcastNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const BroadcastNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.broadcast(" << Print(op->value) << ", " << op->lanes << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const LetNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const LetNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.let(" << Print(op->var) << ", " << Print(op->value) << ", " << Print(op->body) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const CallNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const CallNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; if (auto* ptr_op = op->op.as()) { doc << Doc::Text(ptr_op->name) << "("; @@ -531,13 +637,15 @@ Doc TVMScriptPrinter::VisitExpr_(const CallNode* op) 
{ return doc; } -Doc TVMScriptPrinter::VisitExpr_(const ShuffleNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const ShuffleNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.shuffle(" << Print(op->vectors) << ", " << Print(op->indices) << ")"; return doc; } -Doc TVMScriptPrinter::VisitExpr_(const ReduceNode* op) { +Doc TVMScriptPrinter::VisitExpr_(const ReduceNode* op, ExprPrecedence* out_precedence) { + *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << "tir.reduce(" << Print(op->combiner) << ", " << Print(op->source) << ", " << Print(op->axis) << ", " << op->value_index << ")"; @@ -550,7 +658,7 @@ Doc TVMScriptPrinter::VisitStmt_(const LetStmtNode* op) { doc << "with tir.let(" << Print(op->var) << ", " << Print(op->value) << "):"; doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); } else { - if (memo_var_.find(op->var) == memo_var_.end()) var_not_in_headers.insert(op->var.get()); + if (memo_var_.find(op->var) == memo_var_.end()) var_not_in_headers_.insert(op->var.get()); doc << Print(op->var) << ": " << Print(GetType(op->var)) << " = " << Print(op->value) << Doc::NewLine() << PrintBody(op->body); } @@ -586,7 +694,7 @@ Doc TVMScriptPrinter::VisitStmt_(const AttrStmtNode* op) { if (op->node->IsInstance() && (op->attr_key == "thread_extent" || op->attr_key == "virtual_thread")) { const auto* iter_var = Downcast(op->node).get(); - var_not_in_headers.insert(iter_var->var.get()); + var_not_in_headers_.insert(iter_var->var.get()); var_env_map_[iter_var->var] = iter_var->thread_tag; if (current_num_ != num_child_ - 1) { doc << "with tir.launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) << "):"; @@ -595,6 +703,7 @@ Doc TVMScriptPrinter::VisitStmt_(const AttrStmtNode* op) { doc << "tir.launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) << ")"; doc << Doc::NewLine() << PrintBody(op->body); } + TryDeallocVar(iter_var->var); return doc; } // default @@ -636,7 +745,7 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferRealizeNode* op) { } Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) { - var_not_in_headers.insert(op->buffer_var.get()); + var_not_in_headers_.insert(op->buffer_var.get()); Doc doc; auto storage_scope = GetPtrStorageScope(op->buffer_var); if (current_num_ != num_child_ - 1) { @@ -655,6 +764,7 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) { } doc << ")" << Doc::NewLine() << PrintBody(op->body); } + TryDeallocVar(op->buffer_var); return doc; } @@ -685,12 +795,16 @@ Doc TVMScriptPrinter::VisitStmt_(const EvaluateNode* op) { Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { Doc doc; - var_not_in_headers.insert(op->loop_var.get()); + var_not_in_headers_.insert(op->loop_var.get()); const auto* body = op->body.as(); bool simple_loop = op->kind == ForKind::kSerial && op->annotations.empty() && is_zero(op->min); if (simple_loop) loop_stack_.push_back(GetRef(op)); // It is a loop that can be compressed, let the loops below print it out - if (simple_loop && body != nullptr) return Print(GetRef(body)); + if (simple_loop && body != nullptr) { + Doc result = Print(GetRef(body)); + TryDeallocVar(op->loop_var); + return result; + } // It is a loop that can not be compressed bool print_above = !loop_stack_.empty(); // print loops above if needed @@ -707,6 +821,7 @@ Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { } else { doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); } + TryDeallocVar(op->loop_var); return doc; } @@ -716,6 +831,13 @@ Doc 
TVMScriptPrinter::VisitStmt_(const PrefetchNode* op) { return doc; } +Doc TVMScriptPrinter::VisitStmt_(const WhileNode* op) { + Doc doc; + doc << "while " << Print(op->condition) << ":"; + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + return doc; +} + Doc TVMScriptPrinter::VisitType_(const PrimTypeNode* node) { Doc doc; doc << "ty." << runtime::DLDataType2String(node->dtype); @@ -754,13 +876,11 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferStoreNode* op) { return doc; } -Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { - const auto* block_op = op->block.as(); - // print block name and block vars +Doc TVMScriptPrinter::PrintBlockVar(const BlockNode* op) { Doc doc; doc << "with tir.block(["; std::vector block_var_docs; - for (const auto& iter_var : block_op->iter_vars) { + for (const auto& iter_var : op->iter_vars) { Doc block_var_doc; if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { block_var_doc << Print(iter_var->dom->extent); @@ -788,17 +908,25 @@ Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { } block_var_docs.push_back(block_var_doc); } - doc << PrintSep(block_var_docs, Doc::Text(", ")) << "], "; - doc << Doc::StrLiteral(block_op->name_hint) << ")"; + doc << PrintSep(block_var_docs, Doc::Text(", ")) << "]"; + if (!op->name_hint.empty()) { + doc << ", " << Doc::StrLiteral(op->name_hint); + } + doc << ")"; std::vector block_var_names; - for (const auto& iter_var : block_op->iter_vars) { - var_not_in_headers.insert(iter_var->var.get()); + for (const auto& iter_var : op->iter_vars) { + var_not_in_headers_.insert(iter_var->var.get()); block_var_names.push_back(Print(iter_var->var)); } if (!block_var_names.empty()) { doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; } doc << ":"; + return doc; +} + +Doc TVMScriptPrinter::PrintBlockAttr(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); Doc block_attr_doc; // print predicate, binding, read/write tensor region, annotations if (!is_one(op->predicate)) { @@ -814,25 +942,41 @@ Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { block_attr_doc << PrintAnnotations(block_op->annotations); block_attr_doc << "})"; } - // print body + return block_attr_doc; +} + +Doc TVMScriptPrinter::PrintBlockBody(const BlockNode* op) { Doc body; - body << Doc::NewLine(); - for (const auto& alloc_buf : block_op->alloc_buffers) { - buf_not_in_headers.insert(alloc_buf.get()); + for (const auto& alloc_buf : op->alloc_buffers) { + buf_not_in_headers_.insert(alloc_buf.get()); body << Print(alloc_buf) << " = tir.alloc_buffer(" << memo_buf_decl_[alloc_buf] << ")" << Doc::NewLine(); } - for (const auto& match_buf : block_op->match_buffers) { + for (const auto& match_buf : op->match_buffers) { body << Print(match_buf) << Doc::NewLine(); } - if (block_op->init.defined()) { + if (op->init.defined()) { Doc init_block; init_block << "with tir.init():"; - init_block << Doc::Indent(4, Doc::NewLine() << PrintBody(block_op->init.value())); + init_block << Doc::Indent(4, Doc::NewLine() << PrintBody(op->init.value())); body << init_block << Doc::NewLine(); } - body << PrintBody(block_op->body); - doc << Doc::Indent(4, block_attr_doc << body); + body << PrintBody(op->body); + return body; +} + +Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); + // print block name and block vars + Doc doc = PrintBlockVar(block_op); + // print predicate, binding, read/write tensor region, annotations + Doc block_attr_doc = 
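// Illustrative output (hypothetical block, not from the patch): a block with two
// data-parallel iter vars of extent 128 and name hint "update" prints as
//     with tir.block([128, 128], "update") as [vi, vj]:
// followed by the predicate/binding/read/write lines from PrintBlockAttr and the
// allocations, init block, and body emitted by PrintBlockBody.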
PrintBlockAttr(op); + // print body + Doc body = PrintBlockBody(block_op); + doc << Doc::Indent(4, block_attr_doc << Doc::NewLine() << body); + for (const auto& iter_var : block_op->iter_vars) { + TryDeallocVar(iter_var->var); + } return doc; } @@ -890,38 +1034,49 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { memo_buf_.clear(); memo_buf_decl_.clear(); memo_reducer_.clear(); - var_not_in_headers.clear(); - buf_not_in_headers.clear(); + var_not_in_headers_.clear(); + buf_not_in_headers_.clear(); // print signature Doc doc; doc << "def " << (func2var_.find(op) == func2var_.end() ? "func" : func2var_[op]->name_hint) << "("; std::vector params; for (const auto& param : op->params) { - var_not_in_headers.insert(param.get()); + var_not_in_headers_.insert(param.get()); params.push_back(Print(param) << ": " << Print(GetType(param))); } doc << PrintSep(params, Doc::Text(", ")) << ") -> " << Print(primFunc->ret_type) << ":"; Doc body = Doc::NewLine(); // print buffer_bind - for (const auto& it : op->buffer_map) { - buf_not_in_headers.insert(it.second.get()); - body << Print(it.second) << " = tir.match_buffer("; - body << Print(it.first) << ", " << memo_buf_decl_[it.second]; + for (const auto& param : op->params) { + auto it = op->buffer_map.find(param); + if (it == op->buffer_map.end()) continue; + buf_not_in_headers_.insert((*it).second.get()); + body << Print((*it).second) << " = tir.match_buffer("; + body << Print((*it).first) << ", " << memo_buf_decl_[(*it).second]; body << ")" << Doc::NewLine(); } // print comm_reducer for (const auto& it : memo_reducer_) { body << it.second << " = tir.comm_reducer("; - var_not_in_headers.insert(it.first->lhs[0].get()); - var_not_in_headers.insert(it.first->rhs[0].get()); + var_not_in_headers_.insert(it.first->lhs[0].get()); + var_not_in_headers_.insert(it.first->rhs[0].get()); body << "lambda " << Print(it.first->lhs[0]) << ", " << Print(it.first->rhs[0]) << ": " << Print(it.first->result[0]) << ", " << Print(it.first->identity_element[0]); body << ")" << Doc::NewLine(); } // print body - body << "# body" << Doc::NewLine() << PrintBody(op->body); + body << "# body" << Doc::NewLine(); + if (op->body->IsInstance() && + op->body.as()->iter_values.empty()) { + // Skip printing the root block + body << "# with tir.block(\"root\")" << Doc::NewLine(); + const BlockNode* block = op->body.as()->block.get(); + body << PrintBlockBody(block); + } else { + body << PrintBody(op->body); + } // print func attrs Doc header_attr; if (primFunc->attrs.defined()) { @@ -936,7 +1091,7 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { Doc header_buf; std::vector bufs; for (const auto& it : memo_buf_) { - if (buf_not_in_headers.find(it.first.get()) == buf_not_in_headers.end()) { + if (buf_not_in_headers_.find(it.first.get()) == buf_not_in_headers_.end()) { bufs.push_back(it.first.get()); } } @@ -954,7 +1109,7 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { Doc header_var; std::vector vars; for (const auto& it : memo_var_) { - if (var_not_in_headers.find(it.first.get()) == var_not_in_headers.end()) { + if (var_not_in_headers_.find(it.first.get()) == var_not_in_headers_.end()) { vars.push_back(it.first.get()); } } @@ -1032,7 +1187,7 @@ Doc TVMScriptPrinter::PrintBufferRegion(const BufferRegionNode* op) { if (i != 0) doc << ", "; const auto& range = op->region[i]; if (!is_one(range->extent)) { - doc << Print(range->min) << ":" << Print(range->min + range->extent); + doc << Print(range->min) << " : " << Print(range->min + range->extent); }
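// For illustration: a region whose ranges are [0, 16) and [i, i + 1) now prints as
// "A[0 : 16, i]"; unit-extent ranges collapse to a single index, while the others print
// as "min : min + extent" using the spaces introduced by this change.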
else { doc << Print(range->min); } diff --git a/src/relay/analysis/extract_operators.cc b/src/relay/analysis/extract_operators.cc new file mode 100644 index 000000000000..8fd0f87239ff --- /dev/null +++ b/src/relay/analysis/extract_operators.cc @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file extract_operators.cc + * \brief Extract unique operators from an IRModule + */ +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +class OperatorExtractorWrapper : private MixedModeVisitor { + public: + explicit OperatorExtractorWrapper(const IRModule& mod) : mod_(mod) {} + + Map Extract() { + VisitExpr(this->mod_->Lookup("main")); + + return operator_freqs_; + } + + private: + const IRModule mod_; + /*! \brief Map of operator to frequency. */ + Map operator_freqs_; + + void VisitExpr_(const CallNode* n) final { + VisitExpr(n->op); + + auto op = n->op.as(); + if (op) { + auto it = operator_freqs_.find(op->name); + ICHECK(it != operator_freqs_.end()) + << "Call's OpNode must be visited and registered before access"; + operator_freqs_.Set(op->name, 1 + operator_freqs_.at(op->name)); + } + + MixedModeVisitor::VisitExpr_(n); + } + + void VisitExpr_(const OpNode* n) final { + // NOTE: OpNode is visited only once for every operator kind + // regardless of how many times that op appears in the graph. + operator_freqs_.Set(n->name, 0U); + } +}; + +Map ExtractOperatorsPacked(const IRModule& mod) { + return OperatorExtractorWrapper(mod).Extract(); +} + +TVM_REGISTER_GLOBAL("relay.analysis.ExtractOperators").set_body_typed(ExtractOperatorsPacked); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index b2e862b22b48..ad9ba1b2069d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -38,8 +38,8 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -583,8 +583,16 @@ class AOTExecutorCodegen : public MixedModeVisitor { // performing the preexisting AOT executor code generation phase. 
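// Worked example for OperatorExtractorWrapper above (hypothetical module): if main's body is
// add(add(x, y), mul(x, y)), each OpNode is visited exactly once and seeds its entry with 0,
// and every CallNode then increments it, yielding {"add": 2, "mul": 1}.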
IRModule mod = IRModule::FromExpr(func); - IRModule new_mod = - LowerTEPass(targets_, device_context_map, memory_plan, mod_name, [this](Function func) { + backend::FunctionInfo func_info; + + if (memory_plan.defined()) { + // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize + func_info = tec::UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); + } + + IRModule lowered_mod = + tec::LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. @@ -598,9 +606,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { tec::UpdateFunctionMetadata(func, this->function_metadata_); })(mod); - tec::LoweredModule lowered_module = tec::IRModuleToLoweredModule(new_mod); - function_metadata_.Set(runtime::symbol::tvm_module_main, lowered_module.main_func_info); - auto lowered_main = lowered_module.main_module->Lookup("main"); + auto lowered_main = lowered_mod->Lookup("main"); auto lowered_main_func = GetRef(lowered_main.as()); // Post-lowering storage map for writing main func - this should be the same map as previously @@ -653,6 +659,20 @@ class AOTExecutorCodegen : public MixedModeVisitor { auto storage_rewrite = tir::transform::StorageRewrite(); mod_run = storage_rewrite(mod_run); + // The workspace for main function should be calculated after performing storage_rewrite for + // the top level TIR function. + auto workspace_byte_alignment = + target_host_->GetAttr("workspace-byte-alignment").value_or(16); + Integer main_workspace_size = CalculateWorkspaceBytes( + Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), + workspace_byte_alignment); + + Optional main_func_info = + lowered_mod->GetAttr("main_func_info"); + + main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + // Legalize AOT if needed. This means that all the packed calls // need to be wrapped in TVMValues (unless use_unpacked_api is set) if (!use_unpacked_api_) { @@ -662,8 +682,13 @@ class AOTExecutorCodegen : public MixedModeVisitor { ret.function_metadata = std::move(function_metadata_); - ret.lowered_funcs = lowered_module.per_target_module; - ret.external_mods = lowered_module.external_mods; + Optional> external_modules = + lowered_mod->GetAttr>("external_mods"); + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; + + // This is the point where we separate the functions in the module by target + ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); + ret.external_mods = external_modules.value(); if (ret.lowered_funcs.find(target_host_) != ret.lowered_funcs.end()) { ret.lowered_funcs[target_host_]->Update(mod_run); diff --git a/src/relay/backend/contrib/cmsisnn/codegen_cmsisnn.cc b/src/relay/backend/contrib/cmsisnn/codegen_cmsisnn.cc new file mode 100644 index 000000000000..d2e498a52ce4 --- /dev/null +++ b/src/relay/backend/contrib/cmsisnn/codegen_cmsisnn.cc @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include + +#include "../../../../runtime/file_utils.h" +#include "../../../../target/source/codegen_c.h" +#include "../../../qnn/utils.h" + +namespace tvm { +namespace runtime { + +using namespace tir; + +class CodeGenCMSISNN : public tvm::codegen::CodeGenC { + public: + void Init(bool output_ssa) { + decl_stream << "#include \n"; + decl_stream << "#include \n"; + decl_stream << "#include \n"; + decl_stream << "#include \n"; + decl_stream << "#include \n"; + CodeGenC::Init(output_ssa); + } + + /*! + * \brief Emit code that offloads a subgraph to the Cortex-M + * + * \return string of code that offloads a subgraph to the Cortex-M + */ + void AddFunction(const PrimFunc& prim_func) { + PrintExternCPrefix(stream); + CodeGenC::AddFunction(prim_func); + PrintExternCPostfix(stream); + } + + private: + void VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) + if (!op->op.same_as(builtin::call_extern())) { + return; + } + std::string cmsis_func_name = op->args[0].as()->value; + if (cmsis_func_name == "arm_softmax_s8") { + EmitSoftmax(op); + } + return; + } + + /*! * \brief Creates a cplusplus guard prefix for extern "C" printing */ + void PrintExternCPrefix(std::ostringstream& ss) { + PrintIndent(); + ss << "#ifdef __cplusplus\n"; + ss << "extern \"C\" {\n"; + ss << "#endif\n"; + } + + /*! * \brief Creates a cplusplus guard postfix for extern "C" printing */ + void PrintExternCPostfix(std::ostringstream& ss) { + PrintIndent(); + ss << "#ifdef __cplusplus\n"; + ss << "}\n"; + ss << "#endif\n"; + } + + /*! 
* \brief Emits CMSIS-NN code block for softmax */ + void EmitSoftmax(const CallNode* op) { + // @tir.call_extern("arm_softmax_s8", buffer_0, num_rows, row_size, scale, buffer_1, dtype=int8) + std::string cmsis_func_name = op->args[0].as()->value; + int32_t num_rows = op->args[2].as()->value; + int32_t row_size = op->args[3].as()->value; + float quant_scale = op->args[4].as()->value; + + // calculate multiplier and shift for CMSIS-NN softmax API + // Note: tfl micro assumptions + // TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); + // TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128); + // TF_LITE_ENSURE(context, output->params.scale == 1.f / 256); + double beta = 1.0; + int32_t input_bits = 5; + double beta_multiplier = (beta * quant_scale * (1 << (31 - input_bits))); + beta_multiplier = std::min(beta_multiplier, (1ll << 31) - 1.0); + auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier); + int32_t mult = std::get<0>(mult_shift_pair); + int32_t shift = std::get<1>(mult_shift_pair); + int32_t diff_min = (1 << 5) - 1; + diff_min <<= (31 - 5); + diff_min >>= shift; + diff_min *= -1; + + PrintIndent(); + stream << "int32_t num_rows = " << num_rows << ";\n"; + PrintIndent(); + stream << "int32_t row_size = " << row_size << ";\n"; + PrintIndent(); + stream << "int32_t mult = " << mult << ";\n"; + PrintIndent(); + stream << "int32_t shift = " << shift << ";\n"; + PrintIndent(); + stream << "int32_t diff_min = " << diff_min << ";\n"; + PrintIndent(); + stream << cmsis_func_name << "(buffer,"; + PrintIndent(); + stream << " num_rows, row_size, mult, shift, diff_min, buffer1);\n"; + PrintIndent(); + stream << "return;\n"; + } +}; + +class CMSISNNModuleNode : public runtime::ModuleNode { + public: + CMSISNNModuleNode(const std::string& code, const std::string& fmt, + const Array& func_names) + : code_(code), fmt_(fmt), func_names_(func_names) {} + + std::string GetSource(const std::string& format) final { return code_; } + + const char* type_key() const { return "c"; } + + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { + if (name == "get_symbol") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->func_names_[0]; }); + } else if (name == "get_func_names") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->func_names_; }); + } else { + return PackedFunc(nullptr); + } + } + + void SaveToFile(const std::string& file_name, const std::string& format) final { + std::string fmt = GetFileFormat(file_name, format); + std::string meta_file = GetMetaFilePath(file_name); + if (fmt == "c" || fmt == "cu") { + ICHECK_NE(code_.length(), 0); + SaveBinaryToFile(file_name, code_); + } else { + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + } + } + + protected: + std::string code_; + std::string fmt_; + Array func_names_; +}; + +class CMSISNNModule : public Module { + public: + CMSISNNModule() {} + explicit CMSISNNModule(ObjectPtr n) : Module(n) {} + inline CMSISNNModuleNode* operator->(); + inline const CMSISNNModuleNode* operator->() const; +}; + +inline CMSISNNModuleNode* CMSISNNModule::operator->() { + return static_cast(get_mutable()); +} + +static Module CMSISNNModuleNodeCreate(IRModule mod) { + bool output_ssa = false; + CodeGenCMSISNN cg; + Array function_names; + cg.Init(output_ssa); + ICHECK(mod->functions.size() == 1) << "Supports modules with single PrimFunc."; + for (auto kv : mod->functions) { + 
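// Worked example for EmitSoftmax above (assumed inputs): with beta = 1.0, input_bits = 5 and
// quant_scale = 1/256, beta_multiplier = 2^26 / 2^8 = 262144 = 0.5 * 2^19, so
// GetFixedPointMultiplierShift yields mult = 2^30 (0.5 in Q31) and shift = 19, giving
// diff_min = -((31 << 26) >> 19) = -3968.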
ICHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; + auto f = Downcast(kv.second); + auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); + ICHECK(global_symbol.defined()) + << "CodeGenCHost: Expect PrimFunc to have the global_symbol attribute"; + function_names.push_back(global_symbol.value()); + cg.AddFunction(f); + } + std::string code = cg.Finish(); + auto n = make_object(code, "c", function_names); + return Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.module.cmsisnn.create").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = CMSISNNModuleNodeCreate(args[0]); +}); + +} // namespace runtime +} // namespace tvm diff --git a/src/relay/backend/contrib/ethosu/compiler_attrs.cc b/src/relay/backend/contrib/ethosu/compiler_attrs.cc new file mode 100644 index 000000000000..6a87d11d5d6a --- /dev/null +++ b/src/relay/backend/contrib/ethosu/compiler_attrs.cc @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! \brief Attributes to store the compiler options for Arm(R) Ethos(TM)-U NPU. */ +struct EthosUCompilerConfigNode : public tvm::AttrsNode { + String accelerator_config; + + TVM_DECLARE_ATTRS(EthosUCompilerConfigNode, "ext.attrs.EthosUCompilerConfigNode") { + TVM_ATTR_FIELD(accelerator_config) + .describe( + "The class of Arm(R) Ethos(TM)-U NPU; possible values = {ethos-u55-32, ethos-u55-64, " + "ethos-u55-128, ethos-u55-256}") + .set_default("ethos-u55-256"); + } +}; + +class EthosUCompilerConfig : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(EthosUCompilerConfig, Attrs, EthosUCompilerConfigNode); +}; + +TVM_REGISTER_NODE_TYPE(EthosUCompilerConfigNode); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.ethosu.options", EthosUCompilerConfig); + +auto GetCompilerAttrs() { + auto ctx = transform::PassContext::Current(); + auto cfg = ctx->GetConfig("relay.ext.ethosu.options"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues(); + } + return cfg; +} +TVM_REGISTER_GLOBAL("relay.ext.ethosu.get_compiler_attrs").set_body_typed(GetCompilerAttrs); + +} // namespace ethosu +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/ethosu/preprocess.cc b/src/relay/backend/contrib/ethosu/preprocess.cc new file mode 100644 index 000000000000..ac52844091b4 --- /dev/null +++ b/src/relay/backend/contrib/ethosu/preprocess.cc @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done: + * 1) If multiple inputs are present, they need to be concatenated before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concatenated at the end of the external function. + * 4) Then, the concatenated output again needs to be split and made into the original tuple + * output in main. + */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(const IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast(mod->Lookup("main")); + } + + /*! + * \brief This function takes a shape and computes the scalar size value, + * to be used to create flat, single-dimensional tensors. + */ + int64_t CalcSize(const Array& shape) { + int size = 1; + for (auto dim_size : shape) { + size = size * Downcast(dim_size)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. + */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast>(Downcast(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array output_shape = {Integer(flatten_size)}; + return MakeReshape(input, output_shape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original input shapes + * to recreate a Tuple of the original set of tensors.
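 * For illustration (hypothetical shapes, not from this patch): given original tensors of
 * shapes (1, 4, 4, 3) and (1, 8), CalcSize gives 48 and 8, the concatenated 1-D tensor has
 * shape (56,), and split_indices = [48] recovers the two flattened pieces, which are then
 * reshaped back to (1, 4, 4, 3) and (1, 8).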
+ */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array& original_args) { + Array> shapes; + Array flatten_tensor_sizes; + Array split_indices; + Array rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast>(Downcast(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! + * \brief Modify the external function to split its single input back into the original + * inputs required by the original computation. Moreover, the outputs will be flattened + * and concat'd to make a single output. Finally, the external function should only have + * a single input and a single output. + */ + Function ModifyExternalFunction(const Function& func, const GlobalVar& gv, + const DataType& dtype) { + Array inputs; + Var ifms; + if (func->params.size() > 1) { + Array> shapes; + Array flatten_tensor_sizes; + Array split_indices; + + auto func_name = gv->name_hint; + int total_size = 0; + for (auto input : func->params) { + auto shape = Downcast>(Downcast(input->checked_type())->shape); + shapes.push_back(shape); + auto flat_size = CalcSize(shape); + flatten_tensor_sizes.push_back(flat_size); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += flat_size; + } + Array ifms_shape = {total_size}; + ifms = Var(func_name + "_ifms", TensorType(ifms_shape, dtype)); + auto split_outs = MakeSplit(ifms, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = func->params[i]->checked_type(); + inputs.push_back(MakeReshape(split_out, shapes[i])); + } + } else { + CHECK_EQ(func->params.size(), 1); + inputs.push_back(func->params[0]); + ifms = func->params[0]; + } + Map bind_map; + CHECK_EQ(func->params.size(), inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + bind_map.Set(func->params[i], inputs[i]); + } + auto core_compute_expr = Bind(func->body, bind_map); + + // Creation of wrapper inside the external function + Array params = {ifms}; + if (func->body->IsInstance()) { + auto tuple_out = func->body.as(); + Array reshaped_outputs; + for (unsigned int i = 0; i < tuple_out->fields.size(); i++) { + auto out = Downcast(core_compute_expr)->fields[i]; + out->checked_type_ = tuple_out->fields[i]->checked_type_; + reshaped_outputs.push_back(CreateFlattenTensor(out)); + } + auto concat_out = CreateConcatTensor(reshaped_outputs); + auto f = Function(params, concat_out, concat_out->checked_type_, {}, func->attrs); + return InferType(f, this->module_); + } else { + auto f = + Function(params, core_compute_expr, core_compute_expr->checked_type_, {}, func->attrs); + return InferType(f, this->module_); + } + } + + Expr Rewrite_(const CallNode* call, const Expr& post) final { + auto post_call = Downcast(post); + + if (auto glb_var_node = post_call->op.as()) { + auto glb_var = GetRef(glb_var_node); + auto func = Downcast(module_->functions[glb_var]); + + // If the number of inputs and output are 1 --> no need to do anything + if (post_call->args.size() == 1 && !func->body->IsInstance()) { + return post; + } + if
(auto compiler = func->GetAttr(attr::kCompiler)) { + if (compiler == "ethosu") { + auto ext_input = std::move(post_call->args[0]); + auto arg_dtype = Downcast(post_call->args[0]->checked_type())->dtype; + if (post_call->args.size() > 1) { + Array reshaped_inputs; + for (const auto& arg : post_call->args) { + // All arguments should be of the same data type + CHECK_EQ(arg_dtype, Downcast(arg->checked_type())->dtype) + << "Currently NPU external functions require all inputs to be of the same data " + "type"; + reshaped_inputs.push_back(CreateFlattenTensor(arg)); + } + ext_input = CreateConcatTensor(reshaped_inputs); + } + auto ext_func = ModifyExternalFunction(func, glb_var, arg_dtype); + Array new_args = {ext_input}; + module_->Add(glb_var, ext_func); + Expr new_call = Call(glb_var, new_args); + if (func->body->IsInstance()) { + auto original_tuple_out = Downcast(func->body); + new_call = CreateSplitReshapedTensors(new_call, original_tuple_out->fields); + } + return std::move(new_call); + } + } + return post; + } + + private: + IRModule module_; +}; + +IRModule PreprocessExternalFuncIO_(const IRModule& module) { + ExternalFuncIOHandler ex_func_io_handle(module); + auto func = GetRef(module->Lookup("main").as()); + auto preprocessed = PostOrderRewrite(func, &ex_func_io_handle); + module->Update(module->GetGlobalVar("main"), GetRef(preprocessed.as())); + return module; +} + +} // namespace ethosu +} // namespace contrib + +/*! + * \brief This is a pre-processing pass for all NPU external functions. + * Currently, the NPU runtime module expects a single input and a single output. + * Therefore, this pass will concat the inputs pre-call, split again inside ext. func, + * concat the output inside ext. func and re-split again after the call. + */ + +namespace transform { +Pass PreprocessExternalFuncIO() { + runtime::TypedPackedFunc pre_processed_ext_func = + [=](IRModule m, PassContext pc) { + auto _m = contrib::ethosu::PreprocessExternalFuncIO_(m); + return _m; + }; + auto preprocess_pass = + CreateModulePass(pre_processed_ext_func, 0, "PreprocessExternalFuncIO", {}); + return Sequential({preprocess_pass, InferType()}); +} + +TVM_REGISTER_GLOBAL("relay.ext.ethosu.PreprocessExternalFuncIO") + .set_body_typed(transform::PreprocessExternalFuncIO); + +} // namespace transform +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/ethosu/to_te_graph.cc b/src/relay/backend/contrib/ethosu/to_te_graph.cc new file mode 100644 index 000000000000..9646c39da089 --- /dev/null +++ b/src/relay/backend/contrib/ethosu/to_te_graph.cc @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/backend/contrib/ethosu/to_te_graph.cc + * \brief Lower a Relay function to a TE graph.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../../compile_engine.h" +#include "../../utils.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! \brief Node container to represent a Tensor Expression graph. */ +class TEGraphNode : public Object { + public: + /* \brief The inputs to the graph */ + tvm::Array inputs; + /* \brief The outputs to the graph */ + tvm::Array outputs; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("inputs", &inputs); + v->Visit("outputs", &outputs); + } + + static constexpr const char* _type_key = "relay.TEGraph"; + TVM_DECLARE_FINAL_OBJECT_INFO(TEGraphNode, Object); +}; + +class TEGraph : public ObjectRef { + public: + TVM_DEFINE_OBJECT_REF_METHODS(TEGraph, ObjectRef, TEGraphNode); +}; + +TVM_REGISTER_NODE_TYPE(TEGraphNode); + +Array GetShape(const Array& shape) { + // for now, we always use int32 shape when possible + // even if the result of shape inference becomes int64. + Array res; + for (IndexExpr val : shape) { + const int64_t* pval = tir::as_const_int(val); + if (pval != nullptr) { +#ifndef TVM_INDEX_DEFAULT_I64 + ICHECK_LE(pval[0], std::numeric_limits::max()); + ICHECK_GE(pval[0], std::numeric_limits::min()); + res.push_back(IntImm(DataType::Int(32), *pval)); +#else + res.push_back(val); +#endif // TVM_INDEX_DEFAULT_I64 + } else if (val->IsInstance()) { + res.push_back(val.as()->ToVar()); + } else { + res.push_back(val); + } + } + return res; +} + +class RelayToTE : public backend::MemoizedExprTranslator> { + public: + RelayToTE() = default; + + TEGraph Lower(const Function& prim_func) { + auto graph_node = make_object(); + for (Var param : prim_func->params) { + Array inputs; + if (const auto* ttype = param->checked_type().as()) { + tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); + graph_node->inputs.push_back(tensor); + inputs.push_back(tensor); + } else { + // flatten tuple of tensor type. 
+ const auto* tuple_type = param->type_as(); + for (Type field : tuple_type->fields) { + const auto* ttype = field.as(); + ICHECK(ttype != nullptr); + tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); + graph_node->inputs.push_back(tensor); + inputs.push_back(tensor); + } + } + memo_[param] = inputs; + } + graph_node->outputs = this->VisitExpr(prim_func->body); + return TEGraph(graph_node); + } + + Array VisitExpr_(const VarNode* op) final { + LOG(FATAL) << "Free variable " << op->name_hint(); + return {}; + } + + Array VisitExpr_(const ConstantNode* op) final { + using tir::make_const; + ICHECK(op->is_scalar()); + void* data = op->data->data; + DataType dtype = DataType(op->data->dtype); + auto value = te::compute( + {}, + [&](const Array&) { + if (dtype == DataType::Int(32)) { + return make_const(dtype, static_cast(data)[0]); + } else if (dtype == DataType::Int(64)) { + return make_const(dtype, static_cast(data)[0]); + } else if (dtype == DataType::Float(32)) { + return make_const(dtype, static_cast(data)[0]); + } else if (dtype == DataType::Float(64)) { + return make_const(dtype, static_cast(data)[0]); + } else if (dtype == DataType::Bool()) { + return make_const(dtype, static_cast(data)[0]); + } else { + LOG(FATAL) << "not handled"; + return tvm::PrimExpr(); + } + }, + "compile_engine_const", topi::kBroadcast); + return {value}; + } + + Array VisitExpr_(const CallNode* call_node) final { + static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); + ICHECK(flower_call) << "relay.backend.lower_call is not registered."; + + Array inputs; + int count_tuple = 0; + for (Expr arg : call_node->args) { + if (arg->checked_type().as()) { + ++count_tuple; + } + for (te::Tensor tensor : VisitExpr(arg)) { + inputs.push_back(tensor); + } + } + if (count_tuple) { + ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; + } + + ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; + Op op = Downcast(call_node->op); + + Array outputs; + LoweredOutput lowered_out = + (*flower_call)(GetRef(call_node), inputs, tvm::Target("llvm")); + outputs = lowered_out->outputs; + + if (outputs.size() != 1) { + const auto* tuple_type = call_node->checked_type().as(); + ICHECK(tuple_type) << "Expect output to be a tuple type"; + ICHECK_EQ(tuple_type->fields.size(), outputs.size()); + } + return outputs; + } + + Array VisitExpr_(const FunctionNode* op) final { + LOG(FATAL) << "Do not support sub function"; + return Array(); + } + + Array VisitExpr_(const LetNode* op) final { + Array val = VisitExpr(op->value); + ICHECK(!memo_.count(op->var)); + memo_[op->var] = val; + return VisitExpr(op->body); + } + + Array VisitExpr_(const TupleNode* op) final { + Array fields; + for (Expr field : op->fields) { + ICHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; + Array res = VisitExpr(field); + ICHECK_EQ(res.size(), 1); + fields.push_back(res[0]); + } + return fields; + } + + Array VisitExpr_(const TupleGetItemNode* op) final { + const auto* tuple_type = op->tuple->type_as(); + Array tuple = VisitExpr(op->tuple); + ICHECK_EQ(tuple_type->fields.size(), tuple.size()); + ICHECK_GE(op->index, 0); + ICHECK_LT(static_cast(op->index), tuple.size()); + return {tuple[op->index]}; + } +}; + +TVM_REGISTER_GLOBAL("relay.backend.contrib.ethosu.LowerToTE") + .set_body_typed([](Function prim_func) { return RelayToTE().Lower(prim_func); }); + +} // namespace ethosu +} // namespace contrib +} // namespace 
relay +} // namespace tvm diff --git a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc new file mode 100644 index 000000000000..6d332803041d --- /dev/null +++ b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc @@ -0,0 +1,131 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace example_target_hooks { + +class ConvertAddToSubtract : public MixedModeMutator { + public: + explicit ConvertAddToSubtract(IRModule ir_module, Target host_target) + : ir_module_(ir_module), host_target_(host_target) {} + + IRModule Mutate() { + GlobalVar main_global_var = ir_module_->GetGlobalVar("main"); + BaseFunc main = ir_module_->Lookup(main_global_var); + Function main_func = GetRef(main.as()); + + // Copy everything across and mutate the body + Function mutated_main = + Function(main_func->params, VisitExpr(main_func->body), main_func->ret_type, + main_func->type_params, main_func->attrs, main_func->span); + + ir_module_->Update(main_global_var, mutated_main); + + return ir_module_; + } + + private: + tir::Load LoadIndex(const tir::Buffer& buffer, const PrimExpr& index) { + return tir::Load(DataType::Float(32), buffer->data, index, tir::const_true()); + } + + void ReplaceAddWithSubtractPrimFunc(const GlobalVar& new_global_var, const Function& func) { + tir::Buffer x_buffer = tir::decl_buffer({8}, DataType::Float(32), "x"); + tir::Buffer y_buffer = tir::decl_buffer({8}, DataType::Float(32), "y"); + tir::Buffer out_buffer = tir::decl_buffer({8}, DataType::Float(32)); + + tir::Var x_var("x", DataType::Handle()); + tir::Var y_var("y", DataType::Handle()); + tir::Var out_var("out", DataType::Handle()); + + Map dict_attrs; + dict_attrs.Set("global_symbol", new_global_var->name_hint); + dict_attrs.Set("tir.noalias", Bool(true)); + + te::Var index("index", DataType::Int(32)); + tir::Sub indexed_sub = tir::Sub(LoadIndex(x_buffer, index), LoadIndex(y_buffer, index)); + tir::Stmt math_body = tir::Store(out_buffer->data, indexed_sub, index, tir::const_true()); + tir::Stmt math_loop = tir::For(index, 0, 8, tir::ForKind::kSerial, math_body); + + Map buffer_map = { + {x_var, x_buffer}, + {y_var, y_buffer}, + {out_var, out_buffer}, + }; + + tir::PrimFunc replacement_func = tir::PrimFunc({x_var, y_var, out_var}, math_loop, VoidType(), + buffer_map, DictAttrs(dict_attrs)); + replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, host_target_); + ir_module_->Add(new_global_var, replacement_func); + } + + Expr Rewrite_(const CallNode* pre, const Expr& post) override { + if (const CallNode* call = post.as()) { + auto* func = 
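// The PrimFunc constructed by ReplaceAddWithSubtractPrimFunc above corresponds to this
// illustrative TIR pseudocode (an assumed rendering, not part of the patch):
//     for (index = 0; index < 8; ++index)
//       out[index] = x[index] - y[index];  // float32 loads/stores over 8-element buffers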
call->op.as(); + if (func == nullptr) { + return post; + } + + auto func_name = func->GetAttr(::tvm::attr::kGlobalSymbol); + if (func_name.defined() && func_name == "replace_add_with_subtract") { + // Introduce a new global var to map the function to and copy the source type + // over for InferType + GlobalVar new_global_var(func_name.value()); + new_global_var->checked_type_ = func->checked_type(); + ReplaceAddWithSubtractPrimFunc(new_global_var, GetRef(func)); + return Call(new_global_var, call->args, call->attrs, call->type_args, call->span); + } + } + + return post; + } + + public: + IRModule ir_module_; + Target host_target_; +}; + +transform::Pass RelayToTIR() { + runtime::TypedPackedFunc pass_func = + [=](IRModule ir_module, transform::PassContext pass_context) { + auto relay_to_tir = ConvertAddToSubtract(ir_module, Target("c")); + return relay_to_tir.Mutate(); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "RelayToTIR", {}); +} + +} // namespace example_target_hooks +} // namespace contrib +} // namespace relay + +TVM_REGISTER_TARGET_KIND("example_target_hook", kDLCPU) + .set_attr("RelayToTIR", + relay::contrib::example_target_hooks::RelayToTIR()); + +} // namespace tvm diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 486a6dcd7d87..92e7568d9f38 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -36,8 +36,8 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -221,8 +221,17 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorexpr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); + } + + IRModule lowered_mod = + tec::LowerTEPass(targets_, device_context_map, mod_name_, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. @@ -236,27 +245,28 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorfunction_metadata_); })(mod); - tec::LoweredModule lowered_module = tec::IRModuleToLoweredModule(new_mod); - function_metadata_.Set(runtime::symbol::tvm_module_main, lowered_module.main_func_info); - auto main_module = lowered_module.main_module; - main_module = relay::transform::InferType()(main_module); - relay::Function main_func = Downcast(main_module->Lookup("main")); + Optional main_func_info = + lowered_mod->GetAttr("main_func_info"); + + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + + Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); // Now that we have lowered all operators to TIR code, we can proceed with compilation. // // We need to unfortunately re-plan as the previous results have been invalidated by lowering // we will fix this in future refactors. - memory_plan_ = GraphPlanMemory(main_func); + memory_plan_ = GraphPlanMemory(lowered_main_func); // The graph planner also can not handle planning calls to global variables to we must remap // First we convert all the parameters into input nodes. 
- for (auto param : main_func->params) { + for (auto param : lowered_main_func->params) { auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs()); var_map_[param.get()] = AddNode(node_ptr, param); } - heads_ = VisitExpr(main_func->body); + heads_ = VisitExpr(lowered_main_func->body); std::ostringstream os; dmlc::JSONWriter writer(&os); @@ -270,8 +280,14 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(param_storage_ids_[param.first]), param.second))); } ret.function_metadata = std::move(function_metadata_); - ret.lowered_funcs = lowered_module.per_target_module; - ret.external_mods = lowered_module.external_mods; + + Optional> external_modules = + lowered_mod->GetAttr>("external_mods"); + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; + + // This is the point where we separate the functions in the module by target + ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); + ret.external_mods = external_modules.value(); return ret; } diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 76b6f9186eb5..d87cf9811bc7 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -292,13 +292,8 @@ InterpreterState::InterpreterState(Expr current_expr, InterpreterState::Stack st class Interpreter : public ExprFunctor, PatternFunctor { public: - // TODO(mbs): Collapse mod and per_target_module once IRModule subsumes LoweredModule. - Interpreter(IRModule mod, Map per_target_module, Device device, Target target) - : mod_(mod), - per_target_module_(per_target_module), - device_(device), - target_(target), - debug_op_(Op::Get("debug")) {} + Interpreter(IRModule unified_mod, Device device, Target target) + : unified_mod_(unified_mod), device_(device), target_(target), debug_op_(Op::Get("debug")) {} template T WithFrame(const Frame& fr, const std::function& f) { @@ -315,7 +310,7 @@ class Interpreter : public ExprFunctor, ObjectRef VisitExpr_(const VarNode* var_node) final { return Lookup(GetRef(var_node)); } ObjectRef VisitExpr_(const GlobalVarNode* op) final { - return Eval(mod_->Lookup(GetRef(op))); + return Eval(unified_mod_->Lookup(GetRef(op))); } ObjectRef VisitExpr_(const OpNode* id) override { @@ -386,9 +381,9 @@ class Interpreter : public ExprFunctor, // Project out just the function(s) we need. IRModule lowered_projected_mod; + Map per_target_module = tec::GetPerTargetModules(unified_mod_); std::unordered_map - per_target_module_std_map = - backend::TargetModuleMapToTargetStrModuleMap(per_target_module_); + per_target_module_std_map = backend::TargetModuleMapToTargetStrModuleMap(per_target_module); auto mod_itr = per_target_module_std_map.find(target); ICHECK(mod_itr != per_target_module_std_map.end()) << "No target module for target '" << target->str() << "'"; @@ -547,7 +542,7 @@ class Interpreter : public ExprFunctor, * * @param prim_fn_var Global bound to lowered primitive. * @param all_prim_fn_vars All globals references by lowered primitive, plus prim_fn_var itself. - * @param prim_shape_fn_var Global bound to lowered shape function for primitive, if neeeded. + * @param prim_shape_fn_var Global bound to lowered shape function for primitive, if needed. * @param all_prim_shape_fn_vars All globals references by lowered shape function, plus * prim_shape_fn_var itself. 
* @param prim_shape_fn_states Records whether shape and/or data is needed by the dynamic @@ -768,7 +763,7 @@ class Interpreter : public ExprFunctor, ObjectRef VisitExpr_(const TupleGetItemNode* op) final { ObjectRef val = Eval(op->tuple); const auto* adt_obj = val.as(); - ICHECK(adt_obj) << "interal error: when evaluating TupleGetItem expected an ADT value"; + ICHECK(adt_obj) << "internal error: when evaluating TupleGetItem expected an ADT value"; auto adt = GetRef(adt_obj); ICHECK_LT(static_cast(op->index), adt.size()) << "internal error: index out of bounds"; return adt[op->index]; @@ -875,13 +870,11 @@ class Interpreter : public ExprFunctor, } private: - // Main module. All expressions are eval'ed w.r.t. the definitions in this module. This module - // may contain calls to TIR functions bound in a per_target_module_ below. - IRModule mod_; - // Map from target key to lowered TIR functions derived from mod_. - // Note that primitives are implicitly executed on target_, while shape functions are implicitly - // executed on the default 'cpu' host. Thus this map has at most two entries. - Map per_target_module_; + // Unified module. Functions are annotated with their target. + // All expressions are eval'ed w.r.t. the definitions in this module. + // This module contains functions that used to be in main_module and the per_target_module (TIR + // functions) in one module. + IRModule unified_mod_; // Cached packed functions for the primitives and shape functions, keyed by target and // global var name. std::unordered_map, PackedFunc, PairHash> compiled_packed_funcs_; @@ -901,7 +894,14 @@ class Interpreter : public ExprFunctor, * rewritten \p mod and target-specific modules containing bindings for all TIR primitive * functions needed by the rewritten module. */ -std::pair> Prepare(IRModule mod, Device device, Target target) { +IRModule Prepare(IRModule mod, Device device, Target target) { + // Things to initialize to pass into tec::LowerTEPass + // We only have one device-specific target. + tec::TargetMap targets = {{device.device_type, target}}; + + // All calls to primitives will use the unique target. + tec::DeviceMap device_map; + // Run minimal transforms on module to establish invariants needed by interpreter. transform::Sequential seq({transform::SimplifyInference(), // FuseOps will mark wrapped calls to prim-ops with the 'Primitive' @@ -910,28 +910,15 @@ std::pair> Prepare(IRModule mod, Device device, // eta expand to support constructors in argument position transform::EtaExpand( /*expand_constructor=*/true, /*expand_global_var=*/false), - transform::InferType()}); + transform::InferType(), + tec::LowerTEPass(targets, device_map, /*module_name=*/"intrp", + [](Function func) { /* no-op */ })}); transform::PassContext pass_ctx = transform::PassContext::Current(); With ctx(pass_ctx); mod = seq(mod); - // We only have one device-specific target. - tec::TargetMap targets = {{device.device_type, target}}; - - // All calls to primitives will use the unique target. - tec::DeviceMap device_map; - - // No need for a memory plan. - backend::StaticMemoryPlan memory_plan; /*=nullptr*/ - - // Lower all primitive functions reachable from expr. - // TODO(mbs): This should be just another pass in seq above, which requires LoweredModule to - // be merged into IRModule. 
- LoweredModule lowered_module = - tec::LowerTE(mod, targets, device_map, memory_plan, /*module_name=*/"intrp", - [](Function func) { /* no-op */ }); - return {lowered_module.main_module, lowered_module.per_target_module}; + return mod; } /*! \brief Check if an expression could be changed by \p Prepare. @@ -1020,11 +1007,9 @@ TypedPackedFunc)> EvalFunction(IRModule mod, Expr expr, De // and can just eval it directly. expr_to_eval = expr; } - std::pair> main_and_lowered = - Prepare(mod_with_expr, device, target); - std::shared_ptr intrp = std::make_shared( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); + IRModule lowered_mod = Prepare(mod_with_expr, device, target); + + std::shared_ptr intrp = std::make_shared(lowered_mod, device, target); // // Step 2: Evaluate target function to a closure. @@ -1063,12 +1048,11 @@ ObjectRef Eval(Expr expr, Map type_definitions, std::unordered_set import_set, Device device, Target target) { std::pair mod_and_global = IRModule::FromExprInContext(expr, /*global_funcs=*/{}, type_definitions, import_set); - std::pair> main_and_lowered = - Prepare(mod_and_global.first, device, target); - Interpreter intrp( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); - Expr expr_to_eval = main_and_lowered.first->GetGlobalVar(mod_and_global.second->name_hint); + + IRModule mod = Prepare(mod_and_global.first, device, target); + + Interpreter intrp(mod, device, target); + Expr expr_to_eval = mod->GetGlobalVar(mod_and_global.second->name_hint); if (expr.as() == nullptr) { // TODO(mbs): IRModule::FromExpr will implicitly close over the free vars of expr // unless it is a function, so we must reverse that in the expression to eval. diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 06d862b781e1..e322ccaff1ce 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -17,7 +17,7 @@ * under the License. 
 */
-#include "te_compiler.h"
+#include "./te_compiler.h"
 
 #include
 #include
@@ -42,8 +42,8 @@
 #include
 #include
 
-#include "te_compiler_cache.h"
-#include "utils.h"
+#include "./te_compiler_cache.h"
+#include "./utils.h"
 
 namespace tvm {
 namespace relay {
@@ -85,39 +85,53 @@ class TECompilerImpl : public TECompilerNode {
     return LowerShapeFuncInternal(key)->cached_func;
   }
 
-  Map<Target, IRModule> GetLoweredFunctions() {
-    std::unordered_map<Target, IRModule, backend::TargetStrHash, backend::TargetStrEqual>
-        lowered_functions;
+  IRModule GetLoweredFunctions() {
+    IRModule mod;
+    // Extract lowered functions from the cache
     for (const auto& it : cache_) {
       auto source_func = it.first;
       auto lowered_func = it.second;
-      auto target = source_func->target;
 
-      if (!lowered_functions.count(target)) {
-        lowered_functions[target] = IRModule(Map<GlobalVar, BaseFunc>({}));
-      }
+      IRModule lowered_mod = lowered_func->cached_func->funcs;
 
-      lowered_functions[target]->Update(lowered_func->cached_func->funcs);
-    }
+      // Annotate functions with their target and put them in the return module
+      for (auto kv : lowered_mod->functions) {
+        const GlobalVar& var = kv.first;
+        const BaseFunc& func = kv.second;
+        // Only add functions that are not external functions
+        if (!func->GetAttr<String>(attr::kCompiler).defined()) {
+          ICHECK(func->IsInstance<tir::PrimFuncNode>())
+              << "Expected all functions that are not external to be PrimFuncs, but found "
+              << func->GetTypeKey();
+          const tir::PrimFunc& prim_func = Downcast<tir::PrimFunc>(func);
+          mod->Update(var, WithAttr(prim_func, tvm::attr::kTarget, source_func->target));
+        }
+      }
+    }
 
+    // Extract lowered dynamic shape functions from the shape cache
     for (const auto& it : shape_func_cache_) {
       auto source_func = it.first;
       auto lowered_func = it.second;
       auto target = source_func->target;
-
-      if (!lowered_functions.count(target)) {
-        lowered_functions[target] = IRModule(Map<GlobalVar, BaseFunc>({}));
+      IRModule lowered_mod = lowered_func->cached_func->funcs;
+
+      // Annotate functions with their target and put them in the return module
+      for (auto kv : lowered_mod->functions) {
+        const GlobalVar& var = kv.first;
+        const BaseFunc& func = kv.second;
+        const tir::PrimFunc& prim_func = Downcast<tir::PrimFunc>(func);
+        mod->Update(var, WithAttr(prim_func, tvm::attr::kTarget, source_func->target));
       }
-
-      lowered_functions[target]->Update(lowered_func->cached_func->funcs);
     }
-    return backend::TargetStrModuleMapToTargetModuleMap(lowered_functions);
+    return mod;
   }
 
   Array<tvm::runtime::Module> LowerExternalFunctions() {
     Array<tvm::runtime::Module> ret;
     std::unordered_map<std::string, std::string> cached_symbol;
     std::vector<CCacheKey> cached_ext_funcs;
+
     for (const auto& it : cache_) {
       auto src_func = it.first->source_func;
       ICHECK(src_func.defined());
@@ -370,10 +384,12 @@ class LowerTensorExprMutator : public ExprMutator {
    * \brief Returns the primitive function associated with \p expr, or
    * nullptr if none.
    */
-  Function ResolveToPrimitive(Expr expr) {
+  BaseFunc ResolveToPrimitive(Expr expr) {
    if (const GlobalVarNode* gvn = expr.as<GlobalVarNode>()) {
      BaseFunc base_func = module_->Lookup(GetRef<GlobalVar>(gvn));
      return ResolveToPrimitive(base_func);
+    } else if (const tir::PrimFuncNode* prim_func = expr.as<tir::PrimFuncNode>()) {
+      return GetRef<tir::PrimFunc>(prim_func);
    } else if (const VarNode* vn = expr.as<VarNode>()) {
      auto itr = primitive_functions_.find(GetRef<Var>(vn));
      return itr == primitive_functions_.end() ? Function() : itr->second;
    }
@@ -503,10 +519,17 @@ class LowerTensorExprMutator : public ExprMutator {
   Expr VisitExpr_(const LetNode* let) override {
     Var var = Downcast<Var>(Mutate(let->var));
     Expr value = Mutate(let->value);
-    Function prim_func = ResolveToPrimitive(value);
+    BaseFunc prim_func = ResolveToPrimitive(value);
+
     if (prim_func.defined()) {
+      // Already lowered by other means, no need to mutate the Let node
+      if (prim_func->IsInstance<tir::PrimFuncNode>()) {
+        return GetRef<Expr>(let);
+      }
+
       // Remember let var is bound to (possibly indirectly) to a primitive.
-      primitive_functions_.emplace(let->var, prim_func);
+      Function func = Downcast<Function>(prim_func);
+      primitive_functions_.emplace(let->var, func);
     }
     Expr body = Mutate(let->body);
     if (prim_func.defined()) {
@@ -524,7 +547,7 @@ class LowerTensorExprMutator : public ExprMutator {
     Call expr = GetRef<Call>(call);
 
     // Look for (indirect) calls to primitives.
-    Function prim_func = ResolveToPrimitive(call->op);
+    BaseFunc prim_func = ResolveToPrimitive(call->op);
     if (!prim_func.defined()) {
       // Not a call to a primitive function.
       if (const FunctionNode* fn = call->op.as<FunctionNode>()) {
@@ -533,6 +556,12 @@
       return ExprMutator::VisitExpr_(call);
     }
 
+    // Already lowered by other means so we don't need to mutate
+    // the call
+    if (prim_func->IsInstance<tir::PrimFuncNode>()) {
+      return expr;
+    }
+
     // Find the desired target device.
     Target target;
     if (prim_func->GetAttr<String>(attr::kCompiler).defined()) {
@@ -552,7 +581,8 @@
     }
 
     // Lower the primitive function for that target.
-    std::pair<GlobalVar, Attrs> pair = LowerFunction(prim_func, target);
+    Function func = Downcast<Function>(prim_func);
+    std::pair<GlobalVar, Attrs> pair = LowerFunction(func, target);
 
     // Similarly transform arguments.
     Array<Expr> args;
@@ -583,19 +613,7 @@
   const Op& debug_op_;
 };
 
-Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map,
-                     backend::StaticMemoryPlan memory_plan, const String& module_name,
-                     TECompiler compiler, std::function<void(Function)> process_fn) {
-  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-      [=](Function func, IRModule module, PassContext ctx) {
-        LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn,
-                                        module_name, compiler);
-        return Downcast<Function>(lower_te.Mutate(func));
-      };
-  return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {});
-}
-
-Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets) {
+Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) {
   if (targets.size() == 1) {
     // The homogeneous execution case, return the only target.
     const auto& it = targets.begin();
@@ -625,26 +643,28 @@
   }
 }
 
-/*!
- * \brief Update the "main" control function's metadata - * - * \param mod The module - * \param targets Map of targets - * \return function_infos Function info for each function in the module - */ +Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, + TECompiler compiler, std::function process_fn) { + runtime::TypedPackedFunc pass_func = + [=](Function func, IRModule module, PassContext ctx) { + LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn, + module_name, compiler); + return Downcast(lower_te.Mutate(func)); + }; + return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); +} -backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap targets, +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, Map storage_info_map) { - CHECK_EQ(mod->functions.size(), 1) - << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; Function func = Downcast(mod->Lookup("main")); // This is a Map> - std::unordered_map, EnumClassHash> sid_workspace; + std::unordered_map, backend::EnumClassHash> + sid_workspace; // This is a Map - std::unordered_map device_io; + std::unordered_map device_io; // This is a Map - std::unordered_map device_consts; + std::unordered_map device_consts; // Initialize the mapping from all storage identifiers to workspace sizes, // the amount of device io, and the device constants. @@ -710,7 +730,7 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap tar } // This is a Map - std::unordered_map device_workspace; + std::unordered_map device_workspace; // Once we know the sizes of sids, we need to accumulate per device for (const auto& dev_sid_size : sid_workspace) { auto dev = dev_sid_size.first; @@ -733,17 +753,17 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap tar } for (const auto& dev_and_size : device_workspace) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); workspace_sizes.Set(tgt, dev_and_size.second); relay_primfuncs.Set(tgt, func); } for (const auto& dev_and_size : device_io) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); io_sizes.Set(tgt, dev_and_size.second); } for (const auto& dev_and_size : device_consts) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); constant_sizes.Set(tgt, dev_and_size.second); } @@ -787,7 +807,7 @@ void UpdateFunctionMetadata(Function relay_func, CHECK(prim_fn.defined()) << "the primitive function must be defined"; auto workspace_byte_alignment = - relay_target.value()->GetAttr("workspace_byte_alignment").value_or(16); + relay_target.value()->GetAttr("workspace-byte-alignment").value_or(16); Integer workspace_size = CalculateWorkspaceBytes(prim_fn, workspace_byte_alignment); @@ -830,21 +850,14 @@ void UpdateFunctionMetadata(Function relay_func, function_metadata.Set(prim_fn_var.value()->name_hint, fi); } -LoweredModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn) { +IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_context_map, + const String& module_name, std::function process_fn) { DLOG(INFO) << "lowering 
module:\n" << PrettyPrint(module); TECompiler compiler; - backend::FunctionInfo func_info; - if (memory_plan.defined()) { - // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize - func_info = UpdateMainWorkspaceSize(module, targets, memory_plan->expr_to_storage_info); - } - - auto updated_module = LowerTensorExpr(targets, device_context_map, memory_plan, module_name, - compiler, process_fn)(module); + auto updated_module = + LowerTensorExpr(targets, device_context_map, module_name, compiler, process_fn)(module); // A temporary solution until we can rewrite the auto-scheduler task extraction code to work // in a more reasonable way. @@ -864,124 +877,57 @@ LoweredModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap devic (*te_compiler_update_weights)(weight_map); } - LoweredModule lowered_module; - lowered_module.main_module = updated_module; - lowered_module.per_target_module = compiler->GetLoweredFunctions(); - lowered_module.external_mods = compiler->LowerExternalFunctions(); - lowered_module.main_func_info = func_info; - return lowered_module; -} + // Copy the lowered functions into the return module + updated_module->Update(compiler->GetLoweredFunctions()); -IRModule LoweredModuleToIRModule(LoweredModule mod) { - IRModule unified_module; - - // Copy the main module and its typedefs - for (const auto& kv : mod.main_module->functions) { - unified_module->Add(kv.first, kv.second); - } - for (const auto& kv : mod.main_module->type_definitions) { - unified_module->AddTypeDef(kv.first, kv.second); - } + // Annotate the module with the external modules and function info + updated_module = WithAttr(updated_module, "external_mods", compiler->LowerExternalFunctions()); - // Annotate the per-target functions with their target and add them to the unified module - for (const auto& kv : mod.per_target_module) { - const Target target = kv.first; - const IRModule target_module = kv.second; - - // Right now, per-target functions are TIR functions, which don't have type definitions, so - // there should be no type defs in the per_target_modules - size_t ty_def_size = target_module->type_definitions.size(); - ICHECK(ty_def_size == 0) - << "Expected there to be no type definitions in the per_target_modules, but found " - << ty_def_size; - - for (const auto& kv : target_module->functions) { - const GlobalVar& var = kv.first; - const BaseFunc& func = kv.second; - if (func->IsInstance()) { - tir::PrimFunc primFunc = - WithAttr(Downcast(std::move(func)), tvm::attr::kTarget, target); - unified_module->Add(var, primFunc); - } else if (func->IsInstance()) { - relay::Function relayFunc = - WithAttr(Downcast(std::move(func)), tvm::attr::kTarget, target); - unified_module->Add(var, relayFunc); - } else { - LOG(FATAL) - << "We expected to only have PrimFuncs or RelayFuncs in the target modules, but found " - << func->GetTypeKey(); - } - } - } - - IRModule ret_mod = WithAttr(unified_module, "external_mods", mod.external_mods); - ret_mod = WithAttr(ret_mod, "main_func_info", mod.main_func_info); - return ret_mod; + return updated_module; } -LoweredModule IRModuleToLoweredModule(IRModule mod) { - IRModule main_mod; - // Copy just the TypeDefs from the IRModule to the LoweredModule's main module - // This is the only time we need to do this since there are no TypeDefs in TIR - for (const auto& kv : mod->type_definitions) { - main_mod->AddTypeDef(kv.first, kv.second); - } - - Map per_target_modules; +Map GetPerTargetModules(IRModule mod) { + std::unordered_map + per_target_modules; for (const 
auto& kv : mod->functions) { const GlobalVar& var = kv.first; const BaseFunc& func = kv.second; - if (func->IsInstance()) { - main_mod->Add(var, func); - } else if (func->IsInstance()) { + if (func->IsInstance()) { // Extract target Optional target = func->GetAttr(tvm::attr::kTarget); ICHECK(target) << "Target should be set at this point"; // Put the function in per_target_modules if (!per_target_modules.count(target.value())) { - // Initialize the IRModule for this target and add the function - IRModule target_module; + // Initialize the IRModule for this target with the attributes from the input IRModule + IRModule target_module = IRModule({}, {}, {}, {}, mod->attrs); + // Add the function to the IRModule target_module->Add(var, func); - per_target_modules.Set(target.value(), target_module); + per_target_modules[target.value()] = target_module; } else { // The IRModule for this target is initialized, so just add the function. IRModule target_module = per_target_modules.at(target.value()); target_module->Add(var, func); } - } else { + } else if (!func->IsInstance()) { LOG(FATAL) << "The function types in the IRModule should be RelayFunction or PrimFunc, but got " << func->GetTypeKey(); } } - - // Put the LoweredModule together - LoweredModule lowered_module; - lowered_module.main_module = main_mod; - lowered_module.per_target_module = per_target_modules; - - // Extract external modules and main func info, add to lowered module if they exist - auto external_mods = mod->GetAttr>("external_mods"); - if (external_mods) { - lowered_module.external_mods = external_mods.value(); - } - auto main_func_info = mod->GetAttr("main_func_info"); - if (main_func_info) { - lowered_module.main_func_info = main_func_info.value(); - } - return lowered_module; + return per_target_modules; } -Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, +Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, const String& module_name, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](IRModule module, PassContext ctx) { - return LoweredModuleToIRModule( - LowerTE(module, targets, device_context_map, memory_plan, module_name, process_fn)); + return LowerTE(module, targets, device_context_map, module_name, process_fn); }; - return tvm::transform::CreateModulePass(pass_func, 1, "LowerTE", {}); + + return tvm::transform::Sequential({tvm::relay::transform::RelayToTIRTargetHook(), + tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), + InferType()}); } } // namespace tec } // namespace relay diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 65ba67ac7e1b..d5135e6301c4 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -52,24 +52,15 @@ #include "../transforms/infer_layout_utils.h" #include "../transforms/pass_utils.h" #include "./te_compiler_cache.h" -#include "utils.h" +#include "./utils.h" namespace tvm { namespace relay { namespace tec { -// This class is needed to avoid a GCC 5 bug that prevents maps containing enums -// from being compiled. If i386 GCC version is increased, we can remove it. 
-struct EnumClassHash { - template - std::size_t operator()(T t) const { - return static_cast(t); - } -}; - // TODO(@jroesch, @chrisS) these should be a tvm::Map for uniformity sake // we should a version of context which works in Map -using TargetMap = std::unordered_map; +using TargetMap = std::unordered_map; using DeviceMap = std::unordered_map; using ProcessFn = std::function; @@ -96,8 +87,9 @@ class TECompilerNode : public Object { */ virtual CachedFunc Lower(const CCacheKey& key, const String mod_name) = 0; - /* Return all functions which have been lowered by the compiler, keyed by target. */ - virtual Map GetLoweredFunctions() = 0; + /* Return all functions which have been lowered by the compiler in an IRModule, annotated with + * their target. */ + virtual IRModule GetLoweredFunctions() = 0; /*! * \brief Just in time compile to get a PackedFunc. @@ -113,7 +105,7 @@ class TECompilerNode : public Object { virtual CachedFunc LowerShapeFunc(const CCacheKey& key) = 0; /*! * \brief Lower the external function using external codegen tools. - * \return The runtime moduels for each needed external codegen tool. + * \return The runtime modules for each needed external codegen tool. */ virtual tvm::Array LowerExternalFunctions() = 0; @@ -137,23 +129,6 @@ class TECompiler : public ObjectRef { using ContainerType = TECompilerNode; }; -/*! \brief The result of lowering a module, for now we need to pass an aggregate data structure - * which contains more then a single module in order to interact with the today API. - */ -struct LoweredModule { - /*! \brief The module which contains the Relay code. */ - IRModule main_module; - /*! \brief The module which contains per target code. */ - Map per_target_module; - /*! \brief The external runtime modules which must be combined with the lowered code. */ - Array external_mods; - // TODO(@electriclilies): THis might need to become a map - /*! \brief The info for this function (not sure what a better description is??) - * - */ - backend::FunctionInfo main_func_info; -}; - /*! * \brief A function to create the function metadata for an input function (ie calculate buffer * input/output sizes) @@ -174,27 +149,22 @@ void UpdateFunctionMetadata(Function relay_func, */ Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets); -/*! \brief Utility to convert a LoweredModule to an IRModule. - * - * This function takes all the target specific modules in LoweredModule and - * annotates their functions with the correct target, and puts all those functions - * in one IRModule. - * The purpose of this utility is to allow us to slowly remove LoweredModule from the codebase. +/*! + * \brief Update the "main" control function's metadata * - * \param mod The LoweredModule to convert. - * \return The IRModule form of the input LoweredModule. + * \param mod The module + * \param targets Map of targets + * \return function_infos Function info for each function in the module */ -IRModule LoweredModuleToIRModule(LoweredModule mod); +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map); -/*! \brief Utility to convert an IRModule to a LoweredModule. +/*! \brief Utility to separate the functions in an IRModule by Target. * - * This function takes all the functions in the IRModule and moves them into target-specific - * IRModules stored inside a LoweredModule. - * The purpose of this utility is to allow us to slowly remove LoweredModule from the codebase. - * \param mod The IRModule to convert. 
- * \return The LoweredModule form of the input IRModule. + * \param mod The IRModule to extract the per target module from + * \return The map from Target to IRModule */ -LoweredModule IRModuleToLoweredModule(IRModule mod); +Map GetPerTargetModules(IRModule mod); /*! \brief Lower an IRModule's primitive functions to TIR. * @@ -210,7 +180,7 @@ LoweredModule IRModuleToLoweredModule(IRModule mod); * each function that we lower * \return The lowered module, see above. */ -LoweredModule LowerTE( +IRModule LowerTE( const IRModule& module, TargetMap targets, DeviceMap device_map, backend::StaticMemoryPlan memory_plan, const String& module_name, ProcessFn process_fn = [](Function f) {}); @@ -218,21 +188,18 @@ LoweredModule LowerTE( /*! \brief Pass to lower an IRModule's primitive functions to TIR. * * This is the "back half" of the Relay compiler which lowers "primitive functions" - * to TE expressions, schedules them, and then to TIR. This Pass calls LowerTE, and - * uses LoweredModuleToIRModule utility to convert the output LowerTE's output - * LoweredModule into an IRModule before returning it. + * to TE expressions, schedules them, and then to TIR. It annotates all functions + * with their target. * * \param targets The mapping for devices to targets. * \param device_context_map An analysis result mapping each sub-expression to a device. - * \param memory_plan The memory plan used during lowering * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower * \returns The pass which lowers primative functions to TIR */ transform::Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn); + const String& module_name, std::function process_fn); } // namespace tec } // namespace relay } // namespace tvm diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index cf8a2dd4b8e0..ae8d7d2c2360 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -146,6 +146,17 @@ struct LoweredOutput { runtime::Metadata metadata; }; +/*! + * \brief This class is needed to avoid a GCC 5 bug that prevents maps containing enums from being + compiled. If i386 GCC version is increased, we can remove it. + */ +struct EnumClassHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + /*! * \brief A helper to expand the params by adding the ones used in a given expression. */ diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc index 4a7974cae5ae..344d1cae7823 100644 --- a/src/relay/ir/transform.cc +++ b/src/relay/ir/transform.cc @@ -133,9 +133,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx) DLOG(INFO) << "Executing function pass : " << pass_info->name << " with opt level: " << pass_info->opt_level; - // Execute the pass function and return a new module. - IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); std::vector > updates; for (const auto& it : updated_mod->functions) { diff --git a/src/relay/op/contrib/ethosu/common.cc b/src/relay/op/contrib/ethosu/common.cc new file mode 100644 index 000000000000..bdda81bc7708 --- /dev/null +++ b/src/relay/op/contrib/ethosu/common.cc @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/common.cc + * \brief A set of utilities and common functionality for Arm(R) Ethos(TM)-U NPU QNN ops. + */ + +#include "common.h" + +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace ethosu { + +Array EthosuInferKernelOutput(Array ifm_shape, String ifm_layout, + String ofm_layout, Array kernel_shape, + IndexExpr ofm_channels, Array dilation, + Array strides, Array padding) { + // In the case of NHCWB16, convert the ifm shape to NHW (C not required for this function) + if (ifm_layout == "NHCWB16") { + ifm_shape = {ifm_shape[0], ifm_shape[1], ifm_shape[3]}; + } + Array output_shape({ifm_shape[0], 0, 0, ofm_channels}); + + IndexExpr dilated_ksize_y = 1 + (kernel_shape[0] - 1) * dilation[0]; + IndexExpr dilated_ksize_x = 1 + (kernel_shape[1] - 1) * dilation[1]; + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(padding, &pad_h, &pad_w); + output_shape.Set(1, indexdiv(ifm_shape[1] + pad_h - dilated_ksize_y, strides[0]) + 1); + output_shape.Set(2, indexdiv(ifm_shape[2] + pad_w - dilated_ksize_x, strides[1]) + 1); + + // If the ofm is NHCWB16, convert the layout + if (ofm_layout == "NHCWB16") { + int channel_bricks = 1 + (output_shape[3].as()->value - 1) / 16; + output_shape = {output_shape[0], output_shape[1], channel_bricks, output_shape[2], 16}; + } + + return output_shape; +} + +} // namespace ethosu +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/ethosu/common.h b/src/relay/op/contrib/ethosu/common.h new file mode 100644 index 000000000000..b5377e6e8bdf --- /dev/null +++ b/src/relay/op/contrib/ethosu/common.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/common.h + * \brief Functions for all Arm(R) Ethos(TM)-U NPU operators to use. 
+ */ + +#ifndef TVM_RELAY_OP_CONTRIB_ETHOSU_COMMON_H_ +#define TVM_RELAY_OP_CONTRIB_ETHOSU_COMMON_H_ + +#include + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace ethosu { + +/*! \brief Infer the output tensor shape for convolution and pooling operators. + * \param ifm_shape The shape of Input Feature Map. + * \param ifm_layout The layout of the IFM (NHWC or NHCWB16). + * \param ofm_layout The layout of the OFM (NHWC or NHCWB16). + * \param kernel_shape Kernel shape in format (height, width). + * \param ofm_channels The number of Output Feature Map channels. + * \param dilation The 2-dimensional dilation as (dilation_height, dilation_width). + * \param strides The 2 dimensional strides as (stride_height, stride_width). + * \param padding The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + * \return The shape of the output tensor. + */ +Array EthosuInferKernelOutput(Array ifm_shape, String ifm_layout, + String ofm_layout, Array kernel_shape, + IndexExpr ofm_channels, Array dilation, + Array strides, Array padding); + +} // namespace ethosu +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_OP_CONTRIB_ETHOSU_COMMON_H_ diff --git a/src/relay/op/contrib/ethosu/convolution.cc b/src/relay/op/contrib/ethosu/convolution.cc new file mode 100644 index 000000000000..bad10bf66f3a --- /dev/null +++ b/src/relay/op/contrib/ethosu/convolution.cc @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/convolution.cc + * \brief Operator definitions for the Arm(R) Ethos(TM)-U NPU convolution ops. + */ +#include "../../nn/convolution.h" + +#include +#include +#include +#include +#include + +#include "../../../qnn/utils.h" +#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace ethosu { + +/*! 
\brief Attributes used by the Ethos(TM)-U NPU convolution operator */
+struct EthosuConv2DAttrs : public tvm::AttrsNode<EthosuConv2DAttrs> {
+  double ifm_scale;
+  int ifm_zero_point;
+  int weight_zero_point;
+  double ofm_scale;
+  int ofm_zero_point;
+  Array<IndexExpr> kernel_shape;
+  IndexExpr ofm_channels;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> dilation;
+  String activation;
+  int clip_min;
+  int clip_max;
+  String upscale;
+  String ifm_layout;
+  String ofm_layout;
+
+  TVM_DECLARE_ATTRS(EthosuConv2DAttrs, "relay.attrs.EthosuConv2DAttrs") {
+    TVM_ATTR_FIELD(ifm_scale).describe("The quantization scale for the Input Feature Map tensor.");
+    TVM_ATTR_FIELD(ifm_zero_point)
+        .describe("The quantization zero point for the Input Feature Map tensor.");
+    TVM_ATTR_FIELD(weight_zero_point)
+        .describe("The quantization zero point for the weight tensor.");
+    TVM_ATTR_FIELD(ofm_scale).describe("The quantization scale for the Output Feature Map tensor.");
+    TVM_ATTR_FIELD(ofm_zero_point)
+        .describe("The quantization zero point for the Output Feature Map tensor.");
+    TVM_ATTR_FIELD(kernel_shape)
+        .describe("The 2 dimensional kernel shape as (kernel_height, kernel_width).")
+        .set_default(NullValue<Array<IndexExpr>>());
+    TVM_ATTR_FIELD(ofm_channels)
+        .describe("The number of OFM channels.")
+        .set_default(NullValue<IndexExpr>());
+    TVM_ATTR_FIELD(strides)
+        .set_default(Array<IndexExpr>({1, 1}))
+        .describe("The 2 dimensional strides as (stride_height, stride_width).");
+    TVM_ATTR_FIELD(padding)
+        .set_default(Array<IndexExpr>({0, 0, 0, 0}))
+        .describe("The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).");
+    TVM_ATTR_FIELD(dilation)
+        .set_default(Array<IndexExpr>({1, 1}))
+        .describe("The 2 dimensional dilation as (dilation_height, dilation_width).");
+    TVM_ATTR_FIELD(activation)
+        .describe(
+            "The activation function to use. "
+            "'NONE' - no activation function. "
+            "'CLIP' - clip the output between clip_min and clip_max. "
+            "'TANH' - tanh activation function. "
+            "'SIGMOID' - sigmoid activation function. "
+            "'LUT' - use a look-up table to perform the activation function.")
+        .set_default("NONE");
+    TVM_ATTR_FIELD(clip_min)
+        .describe("The minimum clipping value if activation = 'CLIP'.")
+        .set_default(0);
+    TVM_ATTR_FIELD(clip_max)
+        .describe("The maximum clipping value if activation = 'CLIP'.")
+        .set_default(0);
+    TVM_ATTR_FIELD(upscale)
+        .describe(
+            "The 2x2 upscaling mode to apply to the Input Feature Map tensor. "
+            "'NONE' - no upscaling. "
+            "'NEAREST' - upscale using nearest neighbour. "
+            "'ZEROS' - upscale using zeros.")
+        .set_default("NONE");
+    TVM_ATTR_FIELD(ifm_layout)
+        .set_default("NHWC")
+        .describe("The layout of the Input Feature Map tensor. Can be 'NHWC' or 'NHCWB16'.");
+    TVM_ATTR_FIELD(ofm_layout)
+        .set_default("NHWC")
+        .describe("The layout of the Output Feature Map tensor. Can be 'NHWC' or 'NHCWB16'.");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(EthosuConv2DAttrs);
+
+bool EthosuConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 5);
+  const auto* ifm = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  const auto* scale_bias = types[2].as<TensorTypeNode>();
+  if (ifm == nullptr || weight == nullptr) return false;
+  const auto* param = attrs.as<EthosuConv2DAttrs>();
+  CHECK(param != nullptr) << "EthosuConv2DAttrs cannot be nullptr.";
+  CHECK(ifm->dtype == DataType::UInt(8) || ifm->dtype == DataType::Int(8))
+      << "Expected ethosu_conv2d type(uint8) or type(int8) for ifm but was " << ifm->dtype;
+  CHECK(weight->dtype == DataType::UInt(8) || weight->dtype == DataType::Int(8))
+      << "Expected ethosu_conv2d type(uint8) or type(int8) for weight but was " << weight->dtype;
+  CHECK(scale_bias->dtype == DataType::UInt(8))
+      << "Expected ethosu_conv2d type(uint8) for scale_bias but was " << scale_bias->dtype;
+
+  // The scale_bias should be provided as a tensor of size {ofm_channels, 10}
+  reporter->Assign(types[2], TensorType({weight->shape[0], 10}, DataType::UInt(8)));
+
+  // Assign weight type {ofm_channels, kernel_height, kernel_width, ifm_channels}
+  reporter->Assign(types[1], TensorType({param->ofm_channels, param->kernel_shape[0],
+                                         param->kernel_shape[1], weight->shape[3]},
+                                        weight->dtype));
+
+  // Assign ofm type
+  auto ofm_shape =
+      EthosuInferKernelOutput(ifm->shape, param->ifm_layout, param->ofm_layout, param->kernel_shape,
+                              param->ofm_channels, param->dilation, param->strides, param->padding);
+  reporter->Assign(types[4], TensorType(ofm_shape, ifm->dtype));
+  return true;
+}
+
+Expr MakeEthosuConv2D(Expr ifm, Expr weight, Expr scale_bias, Expr lut, double ifm_scale,
+                      int ifm_zero_point, int weight_zero_point, double ofm_scale,
+                      int ofm_zero_point, Array<IndexExpr> kernel_shape, IndexExpr ofm_channels,
+                      Array<IndexExpr> strides, Array<IndexExpr> padding, Array<IndexExpr> dilation,
+                      String activation, int clip_min, int clip_max, String upscale,
+                      String ifm_layout, String ofm_layout) {
+  auto attrs = make_object<EthosuConv2DAttrs>();
+  attrs->ifm_scale = ifm_scale;
+  attrs->ifm_zero_point = ifm_zero_point;
+  attrs->weight_zero_point = weight_zero_point;
+  attrs->ofm_scale = ofm_scale;
+  attrs->ofm_zero_point = ofm_zero_point;
+  attrs->kernel_shape = std::move(kernel_shape);
+  attrs->ofm_channels = std::move(ofm_channels);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->activation = std::move(activation);
+  attrs->clip_min = clip_min;
+  attrs->clip_max = clip_max;
+  attrs->upscale = std::move(upscale);
+  attrs->ifm_layout = std::move(ifm_layout);
+  attrs->ofm_layout = std::move(ofm_layout);
+  static const Op& op = Op::Get("contrib.ethosu.conv2d");
+  return Call(op, {ifm, weight, scale_bias, lut}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.ethosu_conv2d").set_body_typed(MakeEthosuConv2D);
+
+RELAY_REGISTER_OP("contrib.ethosu.conv2d")
+    .describe(R"code(Arm(R) Ethos(TM)-U NPU 2D quantized convolution operator.
+
+This Relay operator corresponds to the hardware-implemented quantized
+convolution operation found on Ethos(TM)-U NPUs. It accepts either NHWC
+or NHCWB16 format for the input data (Input Feature Map, or IFM) and
+OHWI format for the kernel weights.
+
+Reference: https://developer.arm.com/documentation/102420/0200/
+
+Note that the per-channel weight scale and bias tensor must be packed together into
+a combined tensor of uint80s. This is represented in TVM by a (channels, 10) tensor
+of type uint8. For more detail, refer to the Technical Reference Manual linked above.
+
+- **ifm**: NHWC - (1, ifm_height, ifm_width, ifm_channels)
+           NHCWB16 - (1, ifm_height, ifm_channels // 16, ifm_width, 16)
+- **weight**: (ofm_channels, kernel_shape[0], kernel_shape[1], ifm_channels)
+- **scale_bias**: (ofm_channels, 10)
+- **ofm**: (1, ofm_height, ofm_width, ofm_channels)
+
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type<EthosuConv2DAttrs>()
+    .set_num_inputs(4)
+    .add_argument("ifm", "Tensor", "The Input Feature Map tensor (IFM).")
+    .add_argument("weight", "Tensor", "The weight tensor.")
+    .add_argument("scale_bias", "Tensor", "The packed per-channel weight scale and bias tensor.")
+    .add_argument("lut", "Tensor", "The look-up table values to use if activation = 'LUT'.")
+    .set_support_level(11)
+    .add_type_rel("EthosuConv2D", EthosuConv2DRel);
+
+} // namespace ethosu
+} // namespace contrib
+} // namespace op
+} // namespace relay
+} // namespace tvm
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index 066769ec388e..c27227b2eb73 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -871,8 +871,12 @@ bool Conv1DTransposeRel(const Array<Type>& types, int num_inputs, const Attrs& a
     dilated_ksize_x = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
     channels = param->channels;
 
+    DataType weight_dtype = data->dtype;
+    if (weight != nullptr) {
+      weight_dtype = weight->dtype;
+    }
     // assign result to reporter
-    reporter->Assign(types[1], TensorType(wshape, data->dtype));
+    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
@@ -965,8 +969,12 @@ bool Conv3DTransposeRel(const Array<Type>& types, int num_inputs, const Attrs& a
     dilated_ksize_x = 1 + (param->kernel_size[2] - 1) * param->dilation[2];
     channels = param->channels;
 
+    DataType weight_dtype = data->dtype;
+    if (weight != nullptr) {
+      weight_dtype = weight->dtype;
+    }
     // assign result to reporter
-    reporter->Assign(types[1], TensorType(wshape, data->dtype));
+    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
@@ -1076,8 +1084,12 @@ bool Conv2DTransposeRel(const Array<Type>& types, int num_inputs, const Attrs& a
     dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
     channels = param->channels;
 
+    DataType weight_dtype = data->dtype;
+    if (weight != nullptr) {
+      weight_dtype = weight->dtype;
+    }
     // assign result to reporter
-    reporter->Assign(types[1], TensorType(wshape, data->dtype));
+    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index a05e460dc680..f334361874a3 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -210,7 +210,7 @@ InferCorrectLayoutOutput DenseInferCorrectLayout(const Attrs& attrs,
                                                  const Array<Layout>& new_in_layouts,
                                                  const Array<Layout>& old_in_layouts,
                                                  const Array<tvm::relay::Type>& old_in_types) {
-  return InferCorrectLayoutOutput({"NC", "NK"}, {"NC"}, attrs);
+  return InferCorrectLayoutOutput({"NC", "NC"}, {"NC"}, attrs);
 }
 
 TVM_REGISTER_GLOBAL("relay.op.nn._make.dense").set_body_typed(MakeDense);
diff --git a/src/relay/op/tensor/math.cc b/src/relay/op/tensor/math.cc
new file mode 100644
index 000000000000..246fba62cc66
--- /dev/null
+++ b/src/relay/op/tensor/math.cc
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file math.cc
+ * \brief Math operators.
+ */
+#include
+#include
+#include
+
+#include "../make_op.h"
+#include "../op_common.h"
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.einsum
+TVM_REGISTER_NODE_TYPE(EinsumAttrs);
+
+bool EinsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+               const TypeReporter& reporter) {
+  // Check attrs
+  const EinsumAttrs* param = attrs.as<EinsumAttrs>();
+  if (param == nullptr) {
+    reporter->GetDiagCtx().EmitFatal(Diagnostic::Error(reporter->GetSpan())
+                                     << "the call attributes are not defined");
+    return false;
+  }
+
+  // types: [data, result]
+  ICHECK_EQ(types.size(), 2) << "the arity of einsum is 2, not " << types.size();
+
+  // Check input type is a tuple.
+  const auto* tensor_tuple = types[0].as<TupleTypeNode>();
+  if (tensor_tuple == nullptr) {
+    reporter->GetDiagCtx().EmitFatal(
+        Diagnostic::Error(reporter->GetSpan())
+        << "einsum requires a tuple of tensors as the first argument, found "
+        << PrettyPrint(types[0]));
+    return false;
+  }
+
+  // Check the input tuple consists of tensors with consistent dtype.
+  const auto& first = Downcast<TensorType>(tensor_tuple->fields[0]);
+  const DataType dtype = first->dtype;
+  std::vector<Array<PrimExpr>> input_shapes;
+  for (const Type& ele : tensor_tuple->fields) {
+    if (ele.as<IncompleteTypeNode>()) {
+      return false;
+    }
+
+    const auto& e = Downcast<TensorType>(ele);
+
+    const DataType& e_dtype = e->dtype;
+    if (e_dtype != dtype) {
+      throw Error("relay.einsum requires all tensors have the same dtype");
+    }
+    input_shapes.push_back(e->shape);
+  }
+
+  // Calculate output shape
+  Array<IndexExpr> oshape = topi::NumpyEinsumShape(param->equation, input_shapes);
+
+  auto rtype = TensorType(oshape, dtype);
+  reporter->Assign(types[1], rtype);
+  return true;
+}
+
+Array<te::Tensor> EinsumCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
+                                const Type& out_type) {
+  const EinsumAttrs* param = attrs.as<EinsumAttrs>();
+  ICHECK(param != nullptr);
+  return Array<te::Tensor>{topi::einsum(param->equation, inputs)};
+}
+
+Expr MakeEinsum(Expr data, String equation) {
+  auto attrs = make_object<EinsumAttrs>();
+  attrs->equation = std::move(equation);
+  static const Op& op = Op::Get("einsum");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.einsum").set_body_typed(MakeEinsum);
+
+RELAY_REGISTER_OP("einsum")
+    .describe(R"doc(Evaluates the Einstein summation convention
+on the operands)doc" TVM_ADD_FILELINE)
+    .set_attrs_type<EinsumAttrs>()
+    .set_num_inputs(1)
+    .add_argument("data", "Tuple of Tensors", "The input list of tensors.")
+    .set_support_level(11)
+    .add_type_rel("Einsum", EinsumRel)
+    .set_attr<FTVMCompute>("FTVMCompute", EinsumCompute)
+    .set_attr<TOpPattern>("TOpPattern", kInjective);
+
+} // namespace relay
+} // namespace tvm
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index cf5266485f2e..5782f1f6b4d1 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -495,7 +495,7 @@ Expr Conv2DSecondTerm(const Expr& padded_data, const Expr& kernel_zero_point,
  * \param input_zero_point The input zero point expr.
  * \param param The qnn conv2d attributes.
  * \param out_channels The number of output channels.
- * \return The sequence of Relay operatos for term3.
+ * \return The sequence of Relay operators for term3.
  * \note The term3 looks like this
  *
  *       Sigma(c,r,s) zp_a * QW(k, c, r, s)
 *
@@ -625,7 +625,7 @@ Expr Conv2DCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
  * \node Lowering of the qnn.conv2d operator
  *       A quantized tensor is represented in following manner
  *          A = scale_a x (QA - zp_A)
- *       where QA is quantized tensor, scale_a and zp_A are quantizations
+ *       where QA is quantized tensor, scale_a and zp_A are quantization
  *       params.
  *
  *       Quantized convolution will convolve two quantized tensors and returns a
@@ -662,8 +662,8 @@
  *       a workaround, we fall back to simpler lowering using int32 conv if
  *       the conv is dilated. We fallback also in case of grouped conv.
  *
- *       For depthwise, we can similarly unroll the computation. The intial compute is as follows
- *       wehere cm = channel_multiplier
+ *       For depthwise, we can similarly unroll the computation. The initial compute is as follows
+ *       where cm = channel_multiplier
  *
  *       Qc(n, oc, oh, ow) = Sigma(r, s) (Qw(oc/m, oc%/m, r, s) - zp_w)
  *                         * (Qa(n, oc/cm, oh + r, ow + s) - zp_a)
@@ -693,12 +693,13 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
   Expr kernel_zero_point = new_args[3];
   const auto* param = attrs.as<Conv2DAttrs>();
   ICHECK(param != nullptr);
-  // Assertion checks for exisiing support.
+  // Assertion checks for existing support.
   ICHECK(param->data_layout == "NCHW" || param->data_layout == "NHWC")
       << "qnn.conv2d supports only NCHW/NHWC input data layout.";
   ICHECK(param->kernel_layout == "OIHW" || param->kernel_layout == "HWIO" ||
          param->kernel_layout == "HWOI")
      << "qnn.conv2d supports only OIHW/HWIO/HWOI kernel data layout.";
+  ICHECK(param->kernel_size.defined()) << "qnn.conv2d requires kernel size to be specified.";
 
   int batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier;
   std::tie(batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier) =
diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc
index 592fa77aed77..7b733d4777ec 100644
--- a/src/relay/qnn/op/dense.cc
+++ b/src/relay/qnn/op/dense.cc
@@ -61,8 +61,8 @@ bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     }
   }
   ICHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
-  ICHECK(IsScalarType(types[3], DataType::Int(32)));    // weight_zero_point
   ICHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
+  // weight_zero_point can be a scalar or a vector of the same shape as the weight_scale
   AssignType(types[5], DataType::Float(32), param->units, reporter);  // weight_scale
 
   ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
@@ -89,10 +89,17 @@ Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel,
   return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype);
 }
 
-Expr DenseSecondTerm(const Expr& quantized_data, const Expr& kernel_zero_point) {
+Expr DenseSecondTerm(const Expr& quantized_data, const Expr& kernel_zero_point,
+                     const int out_dim_size) {
   Array<Integer> axes = {1};
-  return Multiply(kernel_zero_point,
-                  Sum(Cast(quantized_data, DataType::Int(32)), axes, true, false));
+  Expr reduced_t2 = Sum(Cast(quantized_data, DataType::Int(32)), axes, true, false);
+  Expr multiplied_t2;
+  if (!IsConstScalar(kernel_zero_point)) {
+    multiplied_t2 = Multiply(kernel_zero_point, MakeRepeat(reduced_t2, out_dim_size, 1));
+  } else {
+    multiplied_t2 = Multiply(kernel_zero_point, reduced_t2);
+  }
+  return multiplied_t2;
 }
 
 Expr DenseThirdTerm(const Expr& quantized_kernel, const Expr& input_zero_point) {
@@ -159,25 +166,24 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
   Expr kernel_zero_point = new_args[3];
 
   const auto in_shape = get_shape(arg_types[0]);
+  const auto w_shape = get_shape(arg_types[1]);
   const int reduction_dim_size = get_const_int(in_shape[1]);
+  const int out_dim_size = get_const_int(w_shape[0]);
 
   const auto* qnn_dense_attrs = attrs.as<DenseAttrs>();
 
   auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs);
-  auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point);
+  auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point, out_dim_size);
   auto term3 = DenseThirdTerm(quantized_kernel, input_zero_point);
 
   // Extract the integer zero points.
-  auto kernel_zero_point_int = GetScalarFromConstant<int>(kernel_zero_point);
-  if (!IsConstScalar(input_zero_point)) {
-    if (kernel_zero_point_int == 0) {
-      return Subtract(term1, term3);
-    }
+  if (!IsConstScalar(input_zero_point) || !IsConstScalar(kernel_zero_point)) {
     auto term4 = DenseFourthTerm(input_zero_point, kernel_zero_point, reduction_dim_size);
     return DenseCombineTerms(term1, term2, term3, term4);
   }
 
+  auto kernel_zero_point_int = GetScalarFromConstant<int>(kernel_zero_point);
   auto input_zero_point_int = GetScalarFromConstant<int>(input_zero_point);
 
   // Get all the terms as described in the comments.
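Editorial aside, not part of the patch: the dense.cc hunk above generalizes term2 of the qnn.dense lowering to per-channel kernel zero points, and only takes the scalar fast path when both zero points are constant scalars. The decomposition it relies on is out[n] = term1 - term2 - term3 + term4, with term1 = Sigma_k QA[k]*QW[n,k], term2 = zp_w * Sigma_k QA[k], term3 = zp_a * Sigma_k QW[n,k], and term4 = K*zp_a*zp_w. The standalone C++ sketch below checks that identity on a toy example; all names are illustrative and nothing here is TVM API.

    // Sanity check of the qnn.dense four-term expansion on one output channel.
    // Illustrative only: plain C++, no TVM types involved.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      const int K = 3;                      // reduction dimension
      std::vector<int32_t> qa = {7, 2, 5};  // quantized data row, shape (1, K)
      std::vector<int32_t> qw = {1, 4, 9};  // one quantized weight row, shape (K,)
      const int32_t zp_a = 3, zp_w = 2;     // input / kernel zero points

      // Reference result: subtract zero points first, then reduce.
      int32_t ref = 0;
      for (int k = 0; k < K; ++k) ref += (qa[k] - zp_a) * (qw[k] - zp_w);

      // Expanded form: term1 - term2 - term3 + term4.
      int32_t term1 = 0, sum_qa = 0, sum_qw = 0;
      for (int k = 0; k < K; ++k) {
        term1 += qa[k] * qw[k];
        sum_qa += qa[k];
        sum_qw += qw[k];
      }
      const int32_t term2 = zp_w * sum_qa;       // broadcast per channel if zp_w is a vector
      const int32_t term3 = zp_a * sum_qw;
      const int32_t term4 = K * zp_a * zp_w;

      assert(term1 - term2 - term3 + term4 == ref);
      return 0;
    }

With a per-channel zp_w, term2 and term4 simply become per-output-channel quantities, which is what the MakeRepeat broadcast in DenseSecondTerm above provides.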
diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc
index 7af5c2ac1c33..c843eb3f544e 100644
--- a/src/relay/qnn/op/dequantize.cc
+++ b/src/relay/qnn/op/dequantize.cc
@@ -55,8 +55,15 @@ bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   int axis = dequantize_attrs->axis;
   auto rank = static_cast<int32_t>(data->shape.size());
   axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
-  ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << dequantize_attrs->axis << " is out of range";
-  ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
+
+  // If zero point and scale are scalar then axis doesn't matter.
+  bool scale_is_scalar = (types[1].as<TensorTypeNode>())->shape.size() == 0;
+  bool zp_is_scalar = (types[2].as<TensorTypeNode>())->shape.size() == 0;
+
+  if (!(scale_is_scalar && zp_is_scalar)) {
+    ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << dequantize_attrs->axis << " is out of range";
+    ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
+  }
 
   PrimExpr axis_shape;
   if (rank > 0) {
diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc
index 2f1d7d8da16c..b116eb9da034 100644
--- a/src/relay/qnn/op/quantize.cc
+++ b/src/relay/qnn/op/quantize.cc
@@ -53,8 +53,15 @@ bool QuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   int axis = quantize_attrs->axis;
   auto rank = static_cast<int32_t>(data->shape.size());
   axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
-  ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range";
-  ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range";
+
+  // If zero point and scale are scalar then axis doesn't matter.
+  bool scale_is_scalar = (types[1].as<TensorTypeNode>())->shape.size() == 0;
+  bool zp_is_scalar = (types[2].as<TensorTypeNode>())->shape.size() == 0;
+
+  if (!(scale_is_scalar && zp_is_scalar)) {
+    ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range";
+    ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range";
+  }
 
   PrimExpr axis_shape;
   if (rank > 0) {
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 46de3522061b..a7d214761b9b 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -136,10 +136,17 @@ Expr RequantizeLower(const Expr& input_tensor, const Expr& input_scale,
                      const Expr& output_zero_point, const RequantizeAttrs* param,
                      const Array<IndexExpr>& input_shape, const DataType& out_dtype) {
   auto tensor = Cast(input_tensor, DataType::Int(32));
-  // 1) Subtract the input_zero_point
   auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0);
   if (!IsEqualScalar(input_zero_point, zero_scalar)) {
-    tensor = Subtract(tensor, Cast(input_zero_point, DataType::Int(32)));
+    // Broadcast input zero point if needed.
+    int rank = static_cast<int>(input_shape.size());
+    int axis = (param->axis < 0) ? ((rank > 0) ? rank + param->axis : 0) : param->axis;
+    Expr input_zero_broadcast = ExpandBiasToMatchAxis(Reshape(input_zero_point, {-1}),
+                                                      rank, {axis});
+    tensor = Subtract(tensor, Cast(input_zero_broadcast, DataType::Int(32)));
   }
 
   // 2) If the input and output scales are same, we can skip the fixed point multiplication. Check
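Both relaxed relations skip the axis range check only when the scale and the zero point are rank-0, so per-channel quantization keeps its validation. A minimal Relay snippet (a sketch; it assumes the stock `relay.qnn.op` Python bindings) that now type-checks even though the axis would be out of range for the data rank:

```python
import tvm
from tvm import relay

# Rank-1 data with an out-of-range axis: with scalar scale/zero point the
# axis is irrelevant, so the relaxed type relations accept it.
x = relay.var("x", shape=(4,), dtype="float32")
q = relay.qnn.op.quantize(x, relay.const(0.125, "float32"), relay.const(0, "int32"),
                          axis=1, out_dtype="int8")
dq = relay.qnn.op.dequantize(q, relay.const(0.125, "float32"), relay.const(0, "int32"),
                             axis=1)
mod = tvm.IRModule.from_expr(relay.Function([x], dq))
mod = relay.transform.InferType()(mod)  # would previously trip the ICHECKs
```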
diff --git a/src/relay/transforms/fake_quantization_to_integer.cc b/src/relay/transforms/fake_quantization_to_integer.cc
index b5f434e74c43..77d18d7556f2 100644
--- a/src/relay/transforms/fake_quantization_to_integer.cc
+++ b/src/relay/transforms/fake_quantization_to_integer.cc
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace tvm {
@@ -109,18 +110,23 @@ class SubgraphExtractor : public ExprVisitor {
  protected:
   void VisitExpr_(const CallNode* call_node) override {
     if (call_node->op == quantize_op_) {
+      const auto* attrs = call_node->attrs.as<qnn::QuantizeAttrs>();
+      ICHECK(attrs != nullptr);
       // Only look at arg0 for quantize
       VisitExpr(call_node->args[0]);
       // Collect type of quantize ops
-      affine_types_.Set(GetRef<Expr>(call_node),
-                        TensorAffineType(call_node->args[1], call_node->args[2],
-                                         call_node->checked_type().as<TensorTypeNode>()->dtype));
+      affine_types_.Set(
+          GetRef<Expr>(call_node),
+          TensorAffineType(call_node->args[1], call_node->args[2], attrs->out_dtype, attrs->axis));
     } else if (call_node->op == dequantize_op_) {
+      const auto* attrs = call_node->attrs.as<qnn::DequantizeAttrs>();
+      ICHECK(attrs != nullptr);
       // Collect type of dequantize ops
       affine_types_.Set(
           GetRef<Expr>(call_node),
           TensorAffineType(call_node->args[1], call_node->args[2],
-                           call_node->args[0]->checked_type().as<TensorTypeNode>()->dtype));
+                           call_node->args[0]->checked_type().as<TensorTypeNode>()->dtype,
+                           attrs->axis));
     } else {
       // run normally on everything else.
       ExprVisitor::VisitExpr_(call_node);
diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc
index 7056dfe79fee..7b3f2da716aa 100644
--- a/src/relay/transforms/fold_scale_axis.cc
+++ b/src/relay/transforms/fold_scale_axis.cc
@@ -243,7 +243,9 @@ class ForwardPrep : private MixedModeVisitor {
     }
   }
   // Visitor pattern override.
-  void VisitExpr_(const LetNode* op) {
+  void VisitExpr_(const TupleGetItemNode* op) final { MixedModeVisitor::VisitExpr_(op); }
+
+  void VisitExpr_(const LetNode* op) final {
     ExprVisitor::VisitExpr_(op);
     // do pass through condition
     // by assigning NullValue<Message>
@@ -256,13 +258,13 @@ class ForwardPrep : private MixedModeVisitor {
     flist_.push_back(flazy);
   }
 
-  void VisitExpr_(const FunctionNode* op) {
+  void VisitExpr_(const FunctionNode* op) final {
     ExprVisitor::VisitExpr_(op);
     auto flazy = [this, op] { this->Update(op->body, NullValue<Message>()); };
     flist_.push_back(flazy);
   }
 
-  void VisitExpr_(const CallNode* call) {
+  void VisitExpr_(const CallNode* call) final {
     ExprVisitor::VisitExpr_(call);
     // function to be lazily invoked
     auto flazy = [this, call]() {
@@ -292,7 +294,7 @@ class ForwardPrep : private MixedModeVisitor {
     flist_.push_back(flazy);
   }
 
-  void VisitExpr_(const TupleNode* op) {
+  void VisitExpr_(const TupleNode* op) final {
     ExprVisitor::VisitExpr_(op);
     // do not support pass scale through tuple for now.
auto flazy = [this, op]() { @@ -303,7 +305,7 @@ class ForwardPrep : private MixedModeVisitor { flist_.push_back(flazy); } - void VisitExpr_(const IfNode* op) { + void VisitExpr_(const IfNode* op) final { ExprVisitor::VisitExpr_(op); // do pass through condition // by assigning NullValue diff --git a/src/relay/transforms/label_ops.cc b/src/relay/transforms/label_ops.cc index 861342b03a76..b23b1d92e77a 100644 --- a/src/relay/transforms/label_ops.cc +++ b/src/relay/transforms/label_ops.cc @@ -77,6 +77,25 @@ class LabelOpsMutator : public MixedModeMutator { } return std::move(f); } + Expr VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + this->Mutate(op->var); + this->Mutate(op->value); + }; + auto post_visit = [this](const LetNode* op) { + Var var = Downcast(this->Mutate(op->var)); + auto value = this->Mutate(op->value); + auto body = this->Mutate(op->body); + auto expr = GetRef(op); + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { + this->memo_[expr] = expr; + } else { + this->memo_[expr] = Let(var, value, body); + } + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; + } Expr Rewrite_(const CallNode* op, const Expr& post) final { auto updated = MixedModeMutator::Rewrite_(op, post); diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index b48fbe44bd11..f74cf983ccae 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -30,6 +30,7 @@ */ #include +#include #include #include #include @@ -509,7 +510,9 @@ class NameMangleExtFuncs : public MixedModeMutator { // Walk the tree and mangle the functions. Then replace compiler functions // with mangled functions in the module - IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports()); + IRModule new_module = module_->ShallowCopy(); + new_module->functions = {}; + for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/transforms/target_hooks.cc b/src/relay/transforms/target_hooks.cc new file mode 100644 index 000000000000..40287ded1dd8 --- /dev/null +++ b/src/relay/transforms/target_hooks.cc @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file target_hooks.cc + * \brief Relay passes for processing Target Hooks which have been registered on functions within + * the IRModule + */ + +#include +#include + +namespace tvm { +namespace relay { +namespace transform { + +class TargetHookVisitor : public tvm::relay::MixedModeVisitor { + /*! \brief Collected pass list for all nodes */ + std::vector pass_list_; + /*! 
\brief Attribute map for all registered targets */ + TargetKindAttrMap target_attr_map_; + + public: + TargetHookVisitor() : target_attr_map_(tvm::TargetKind::GetAttrMap("RelayToTIR")) {} + + std::vector Visit(const IRModule& ir_mod) { + for (const auto& it : ir_mod->functions) { + const BaseFunc& base_func = it.second; + VisitExpr(base_func); + } + return pass_list_; + } + + void VisitExpr_(const CallNode* call) override { + // Descend the call tree + for (auto arg : call->args) { + VisitExpr(arg); + } + + if (const FunctionNode* func = call->op.as()) { + if (!func->GetAttr(attr::kCompiler).defined()) { + return; + } + String code_gen_name = func->GetAttr(attr::kCompiler).value(); + Optional target_kind = tvm::TargetKind::Get(code_gen_name); + if (!target_kind || !target_attr_map_.count(target_kind.value())) { + return; + } + Pass custom_target_pass = target_attr_map_[target_kind.value()]; + if (std::find(pass_list_.begin(), pass_list_.end(), custom_target_pass) == pass_list_.end()) { + pass_list_.push_back(custom_target_pass); + } + } + } +}; + +Pass RelayToTIRTargetHook() { + auto pass_func = [=](IRModule mod, const PassContext& pass_ctx) { + auto target_hook_visitor = TargetHookVisitor(); + std::vector pass_list = target_hook_visitor.Visit(mod); + Sequential run_hooks(pass_list); + + return run_hooks(mod); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "RelayToTIRTargetHook", {}); +} + +} // namespace transform +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index d03fc1488aea..8e952d60b8b7 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -52,7 +52,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { DLOG(INFO) << "ToBBlock:" << std::endl << mod; // Create a new module by shallow copy. - auto mod_ = IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule mod_ = mod->ShallowCopy(); tvm::Map updates; auto funcs = mod_->functions; diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index f29087dcc049..6c2371716b16 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -205,13 +205,17 @@ class TypeInferencer : private ExprFunctor, this->EmitFatal(Diagnostic::Error(op->span) << "Cannot do type inference on global variables " << "without a module"); } - if (mod_->ContainGlobalVar(var->name_hint)) { - relay::Function e = Downcast(mod_->Lookup(var)); - return e->checked_type(); - } else { - return op->checked_type_; + BaseFunc func = mod_->Lookup(var->name_hint); + + if (func->IsInstance()) { + relay::Function relay_func = Downcast(func); + return relay_func->checked_type(); + } } + // Return op->checked_type if the module doesn't contain the GlobalVar or the function is a + // PrimFunc (we don't typecheck PrimFuncs) + return op->checked_type_; } Type VisitExpr_(const ConstantNode* op) final { return op->tensor_type(); } @@ -822,8 +826,7 @@ Pass InferType() { [=](IRModule mod, const PassContext& pass_ctx) { DLOG(INFO) << "tvm::relay::transform::InferType"; // Execute the pass function and return a new module. 
- IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); pass_ctx->diag_ctx = DiagnosticContext::Default(updated_mod); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 08ac2ae0ec45..23f7339605df 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -40,7 +40,7 @@ namespace contrib { TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, - int batch_size) + int batch_size, nvinfer1::IInt8Calibrator* calibrator) : data_entry_(data_entry), max_workspace_size_(max_workspace_size), use_implicit_batch_(use_implicit_batch), @@ -48,6 +48,8 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, batch_size_(batch_size) { // Create TRT builder and network. builder_ = nvinfer1::createInferBuilder(*logger); + use_int8_ = false; + #if TRT_VERSION_GE(6, 0, 1) // Use INetworkV2. auto flags = @@ -56,9 +58,15 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, flags = 0U; builder_->setMaxBatchSize(batch_size_); } + this->calibrator_ = calibrator; + if (calibrator != nullptr) { + use_int8_ = true; + builder_->setFp16Mode(true); + builder_->setInt8Mode(true); + builder_->setInt8Calibrator(calibrator); + } network_ = builder_->createNetworkV2(flags); #else - // Use INetwork with implicit batch. builder_->setMaxBatchSize(batch_size_); builder_->setMaxWorkspaceSize(max_workspace_size_); builder_->setFp16Mode(use_fp16_); @@ -158,6 +166,13 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { if (use_fp16_) { config_->setFlag(nvinfer1::BuilderFlag::kFP16); } + + if (use_int8_) { + config_->setFlag(nvinfer1::BuilderFlag::kINT8); + config_->setInt8Calibrator(calibrator_); + LOG(INFO) << "config finishes setting up calibrator as INT8 mode ... "; + } + // Add profiles. if (!use_implicit_batch_) { auto profile = builder_->createOptimizationProfile(); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index 0b1c3997ec57..bf74630bce7f 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -72,8 +72,8 @@ class TensorRTBuilder { * \param batch_size If use_implicit_batch, */ TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, - size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, - int batch_size); + size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, int batch_size, + nvinfer1::IInt8Calibrator* calibrator = nullptr); /*! * \brief Add TensorRT input(s) for input node in network definition. @@ -153,6 +153,9 @@ class TensorRTBuilder { /*! \brief Whether to automatically convert model to 16-bit floating point precision. */ bool use_fp16_; + /*! \brief whether to automatically convert model to int8 precision */ + bool use_int8_; + /*! \brief Batch size to optimize for. */ int batch_size_; @@ -161,6 +164,10 @@ class TensorRTBuilder { /*! \brief Output names. */ std::vector network_output_names_; + + /*! \brief calibrator pointer to add batch data when using int8 mode */ + /*! 
\brief pointer will be nullptr when it is fp16 or fp32 precision */
+  nvinfer1::IInt8Calibrator* calibrator_;
 };
 
 }  // namespace contrib
diff --git a/src/runtime/contrib/tensorrt/tensorrt_calibrator.h b/src/runtime/contrib/tensorrt/tensorrt_calibrator.h
new file mode 100755
index 000000000000..1e340d287629
--- /dev/null
+++ b/src/runtime/contrib/tensorrt/tensorrt_calibrator.h
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * \file runtime/contrib/tensorrt/tensorrt_calibrator.h
+ * \brief Contains the TensorRTCalibrator class, which feeds calibration data
+ * to TensorRT while an INT8 engine is being built.
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
+#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
+
+#include <string>
+#include <vector>
+
+#include "../../cuda/cuda_common.h"
+#include "NvInfer.h"
+
+namespace tvm {
+namespace runtime {
+
+class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
+ public:
+  TensorRTCalibrator(int batch_size, const std::vector<std::string>& input_names)
+      : batch_size_(batch_size), num_batches_calibrated_(0), input_names_(input_names) {}
+
+  ~TensorRTCalibrator() {
+    // Free calibration data
+    for (auto& inputs : data_) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        delete[] inputs[i];
+      }
+    }
+    // Free buffers
+    for (size_t i = 0; i < buffers_.size(); ++i) {
+      CUDA_CALL(cudaFree(buffers_[i]));
+    }
+  }
+
+  void AddBatchData(const std::vector<void*>& bindings, const std::vector<size_t>& binding_sizes) {
+    // Copy data from GPU
+    std::vector<float*> data_host(bindings.size(), nullptr);
+    for (size_t i = 0; i < bindings.size(); ++i) {
+      data_host[i] = new float[batch_size_ * binding_sizes[i]];
+      CUDA_CALL(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
+                           batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost));
+    }
+    data_.push_back(data_host);
+    data_sizes_.push_back(binding_sizes);
+  }
+
+  int getBatchSize() const override { return batch_size_; }
+
+  /*!
+   * \brief TensorRT will call this method to get next batch of data to
+   * calibrate with.
+   */
+  bool getBatch(void* bindings[], const char* names[], int nbBindings) override {
+    AllocateBuffersIfNotAllocated();
+    CHECK_EQ(input_names_.size(), nbBindings);
+    for (size_t i = 0; i < input_names_.size(); ++i) {
+      CHECK_EQ(input_names_[i], names[i]);
+      CUDA_CALL(cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
+                           batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float),
+                           cudaMemcpyHostToDevice));
+      bindings[i] = buffers_[i];
+    }
+    num_batches_calibrated_++;
+    // TODO(trevmorr): Free data from previous batch?
+    return (num_batches_calibrated_ < data_.size());
+  }
+
+  const void* readCalibrationCache(size_t& length) override {
+    if (calibration_cache_.empty()) return nullptr;
+    length = calibration_cache_.size();
+    return calibration_cache_.data();
+  }
+
+  void writeCalibrationCache(const void* cache, size_t length) override {
+    calibration_cache_.assign(static_cast<const char*>(cache), length);
+  }
+
+ private:
+  /*! \brief Batch size. */
+  int batch_size_;
+  /*! \brief Number of batches already fed to calibrator. */
+  int num_batches_calibrated_;
+  /*! \brief Storage for calibration cache. */
+  std::string calibration_cache_;
+
+  /*! \brief Data to be used for calibration. */
+  std::vector<std::vector<float*>> data_;
+  /*! \brief Number of elements for data to be used for calibration. */
+  std::vector<std::vector<size_t>> data_sizes_;
+
+  /*! \brief Device buffers to be used for calibration. */
+  std::vector<void*> buffers_;
+
+  /*! \brief Names of inputs */
+  const std::vector<std::string> input_names_;
+
+  /*! \brief Allocate device memory buffers. data_sizes_ must already have one
+   * entry. */
+  void AllocateBuffersIfNotAllocated() {
+    if (!buffers_.empty()) return;
+    CHECK_GE(data_sizes_.size(), 1);
+    const int num_inputs = data_sizes_[0].size();
+    buffers_.assign(num_inputs, nullptr);
+    for (int i = 0; i < num_inputs; ++i) {
+      CUDA_CALL(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float)));
+    }
+  }
+};
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
index 5562f853383c..a5779f739dac 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -27,6 +27,10 @@
 #include
 #include
 
+#include
+#include
+#include
+#include
 
 #include "../../file_utils.h"
 #include "../json/json_node.h"
@@ -35,6 +39,8 @@
 #ifdef TVM_GRAPH_EXECUTOR_TENSORRT
 #include "NvInfer.h"
 #include "tensorrt_builder.h"
+#include "tensorrt_calibrator.h"
+#include "tensorrt_utils.h"
 #endif
 
 namespace tvm {
@@ -66,7 +72,22 @@ class TensorRTRuntime : public JSONRuntimeBase {
         use_implicit_batch_(true),
         max_workspace_size_(size_t(1) << 30),
         max_batch_size_(-1),
-        multi_engine_mode_(false) {}
+        multi_engine_mode_(false) {
+    const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
+    multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
+    num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
+    if (use_int8) {
+      ICHECK(num_calibration_batches_remaining_ != 0)
+          << "When using INT8 mode, "
+          << "environment variable TENSORRT_NUM_CALI_INT8 "
+          << "must also be set to specify the number of "
+          << "calibration batches";
+      LOG(INFO) << "setting up " << num_calibration_batches_remaining_
+                << " batches of sample data for INT8 calibration ... ";
+      ICHECK(multi_engine_mode_ == false) << "When using INT8 mode, "
+                                          << "multi-engine is not allowed";
+    }
+  }
 
   /*!
   * \brief The type key of the module.
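Taken together with the runtime changes below, INT8 calibration becomes an environment-variable-driven workflow: enable INT8, declare how many calibration batches will be fed, run the module that many times on representative inputs (those runs only feed the calibrator), and the following run builds the calibrated engine. A sketch of a driver script; `gmod` (a graph executor module whose subgraphs were offloaded to TensorRT), the input name `"data"`, and the input arrays are all assumptions:

```python
import os

# Must be set before the TensorRT runtime module is instantiated.
os.environ["TVM_TENSORRT_USE_INT8"] = "1"
os.environ["TENSORRT_NUM_CALI_INT8"] = "10"  # number of calibration batches

# gmod, calibration_batches, and test_batch are assumed to exist.
for batch in calibration_batches:  # exactly the ten batches declared above
    gmod.set_input("data", batch)
    gmod.run()                     # feeds the calibrator, produces no outputs

gmod.set_input("data", test_batch)
gmod.run()                         # builds and then runs the INT8 engine
out = gmod.get_output(0)
```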
@@ -87,7 +108,6 @@ class TensorRTRuntime : public JSONRuntimeBase {
     LoadGlobalAttributes();
     if (GetCachedEnginesFromDisk()) return;
     SetupConstants(consts);
-    multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
   }
 
   void LoadGlobalAttributes() {
@@ -130,7 +150,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
     if (batch_size == 0) return;
     auto engine = engine_and_context.engine;
     auto context = engine_and_context.context;
-    std::vector<void*> bindings(engine->getNbBindings(), nullptr);
+    const int num_bindings = engine->getNbBindings();
+    std::vector<void*> bindings(num_bindings, nullptr);
+    std::vector<size_t> binding_sizes(num_bindings, 0);
     // Setup input bindings.
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
@@ -153,9 +175,26 @@ class TensorRTRuntime : public JSONRuntimeBase {
           device_buffer.CopyFrom(data_entry_[eid]);
           bindings[binding_index] = device_buffer->data;
         }
+
+        auto dims = engine->getBindingDimensions(binding_index);
+        int num_elements = 1;
+        for (int i = 0; i < dims.nbDims; ++i) num_elements *= dims.d[i];
+        binding_sizes[binding_index] = num_elements;
       }
     }
 
+    // Add batch data to the calibrator while calibration batches remain.
+    if (num_calibration_batches_remaining_ > 0) {
+      if (calibrator_ != nullptr) {
+        LOG(INFO) << "Adding batch data to the calibrator ("
+                  << num_calibration_batches_remaining_ << " batches remaining)";
+        calibrator_->AddBatchData(bindings, binding_sizes);
+        num_calibration_batches_remaining_--;
+      }
+      return;
+    }
+
     // Setup output bindings.
     for (size_t i = 0; i < outputs_.size(); ++i) {
       uint32_t eid = EntryID(outputs_[i]);
@@ -225,10 +264,16 @@ class TensorRTRuntime : public JSONRuntimeBase {
   TensorRTEngineAndContext& GetOrBuildEngine() {
     int batch_size = GetBatchSize();
     int compatible_engine_batch_size = -1;
-    if (FindCompatibleEngine(batch_size, &compatible_engine_batch_size)) {
+    bool find_engine_flag = FindCompatibleEngine(batch_size, &compatible_engine_batch_size);
+    const bool use_int8 = (dmlc::GetEnv("TVM_TENSORRT_USE_INT8", 0) != 0);
+    const bool int8_calibration_not_used_or_not_complete =
+        (calibrator_ != nullptr && num_calibration_batches_remaining_ != 0);
+    if (find_engine_flag &&
+        (!use_int8 || calibrator_ == nullptr || int8_calibration_not_used_or_not_complete)) {
       // A compatible engine already exists.
       return trt_engine_cache_.at(std::make_pair(symbol_name_, compatible_engine_batch_size));
     }
+
     // For single engine mode, remove previous engine and update max_batch_size.
     if (!multi_engine_mode_) {
       DestroyEngines();
@@ -236,11 +281,32 @@ class TensorRTRuntime : public JSONRuntimeBase {
     }
     DLOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_
                << " with batch size " << batch_size;
+
+    // Build engine.
+    if (calibrator_ != nullptr && num_calibration_batches_remaining_ == 0) {
+      // Calibration is complete: build the INT8 engine.
+      BuildEngineFromJson(batch_size);
+      calibrator_.reset(nullptr);
+    } else {
+      // Build a new engine.
+      BuildEngineFromJson(batch_size);
+      TensorRTEngineAndContext& engine_and_context =
+          trt_engine_cache_[std::make_pair(symbol_name_, batch_size)];
+      if (use_int8) {
+        this->CreateInt8Calibrator(engine_and_context);
+      }
+    }
+
+    LOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_
+              << " with batch size " << batch_size;
+    CacheEngineToDisk();
+    return trt_engine_cache_.at(std::make_pair(symbol_name_, batch_size));
+  }
+
+  void BuildEngineFromJson(int batch_size) {
     const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
-                            use_fp16, batch_size);
-
-    // Add inputs and constants.
+                            use_fp16, batch_size, calibrator_.get());
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
       const auto& node = nodes_[nid];
@@ -266,12 +332,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
       builder.AddOutput(outputs_[i], EntryID(outputs_[i]));
     }
 
-    // Build engine.
-    trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = builder.BuildEngine();
-    DLOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_
-               << " with batch size " << batch_size;
-    CacheEngineToDisk();
-    return trt_engine_cache_.at(std::make_pair(symbol_name_, batch_size));
+    TensorRTEngineAndContext engine_and_context = builder.BuildEngine();
+    trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context;
   }
 
   /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will check that directory for
@@ -286,7 +348,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
     // Check if engine is in the cache.
     std::ifstream infile(path, std::ios::binary);
     if (!infile.good()) return false;
-    DLOG(INFO) << "Loading cached TensorRT engine from " << path;
+    LOG(INFO) << "Loading cached TensorRT engine from " << path;
     infile.close();
     std::string serialized_engine;
     LoadBinaryFromFile(path, &serialized_engine);
@@ -308,6 +370,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
     helper.ReadAllFields(&reader);
     const int batch_size = GetBatchSize();
     trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context;
+    LOG(INFO) << "Finished loading cached engine and context ... ";
     return true;
   }
 
@@ -369,10 +432,24 @@ class TensorRTRuntime : public JSONRuntimeBase {
     return device_buffers_.at(binding_index);
   }
 
+  void CreateInt8Calibrator(const TensorRTEngineAndContext& engine_and_context) {
+    // Get input names in binding order.
+    std::vector<std::string> input_names;
+    for (size_t i = 0; i < engine_and_context.inputs.size(); i++) {
+      std::string ele = engine_and_context.inputs[i];
+      input_names.push_back(ele);
+    }
+    const int batch_size = GetBatchSize();
+    calibrator_.reset(new TensorRTCalibrator(batch_size, input_names));
+  }
+
   /*! \brief Map of function name and max batch size to TRT engine if built already. */
   std::unordered_map<std::pair<std::string, int>, TensorRTEngineAndContext, PairHash>
       trt_engine_cache_;
 
+  /*! \brief Calibrator for INT8 mode. */
+  std::unique_ptr<TensorRTCalibrator> calibrator_;
+
   /*! \brief Map of binding index to GPU buffers for inputs and outputs. Only used when target device
   * is not "cuda". Since TensorRT execution can only read data from GPU, we need to copy data from
   * the runtime device to these buffers first.
These will be allocated for the highest batch size @@ -402,6 +479,9 @@ class TensorRTRuntime : public JSONRuntimeBase { size_t max_workspace_size_; + /*! \brief Number of calibration batches until we are done. */ + int num_calibration_batches_remaining_; + /*! \brief Highest batch size that an engine has been built for, used in single-engine mode only * (multi_engine_mode=false). */ int max_batch_size_; diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 04721ee6d705..ea986a3bf096 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -395,6 +395,8 @@ int RPCGetCRTMaxPacketSize(TVMValue* args, int* type_codes, int num_args, TVMVal return 0; } +int TVMContribRandomFill(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val, + int* ret_type_code); tvm_crt_error_t TVMInitializeRuntime() { int idx = 0; tvm_crt_error_t error = kTvmErrorNoError; @@ -432,6 +434,10 @@ tvm_crt_error_t TVMInitializeRuntime() { error = TVMFuncRegisterGlobal("tvm.rpc.server.GetCRTMaxPacketSize", &RPCGetCRTMaxPacketSize, 0); } + if (error == kTvmErrorNoError) { + error = TVMFuncRegisterGlobal("tvm.contrib.random.random_fill", &TVMContribRandomFill, 0); + } + if (error != kTvmErrorNoError) { TVMPlatformMemoryFree(registry_backing_memory, dev); } @@ -563,3 +569,20 @@ release_and_return : { __attribute__((weak)) tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { return kTvmErrorFunctionCallNotImplemented; } + +// Fill the tensor in args[0] with random data using TVMPlatformGenerateRandom. +// Named to correspond with the analogous function in the C++ runtime. +int TVMContribRandomFill(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val, + int* ret_type_code) { + if (num_args != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (type_codes[0] != kTVMDLTensorHandle) { + return kTvmErrorFunctionCallWrongArgType; + } + + DLTensor* tensor = (DLTensor*)args[0].v_handle; + TVMNDArray arr = {*tensor}; + return TVMNDArray_RandomFill(&arr); +} diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index c97f7658938f..16bde3227f7c 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -47,18 +47,22 @@ int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, return 0; } +int64_t TVMNDArray_DataSizeBytes(TVMNDArray* array) { + int64_t num_elems = 1; + int32_t idx; + for (idx = 0; idx < array->dl_tensor.ndim; ++idx) { + num_elems *= array->dl_tensor.shape[idx]; + } + return (num_elems * array->dl_tensor.dtype.bits + 7) / 8; +} + int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array) { int status = TVMNDArray_Create(ndim, shape, dtype, dev, array); if (status != 0) { return status; } - int64_t num_elems = 1; - int32_t idx; - for (idx = 0; idx < array->dl_tensor.ndim; ++idx) { - num_elems *= shape[idx]; - } - int total_elem_bytes = (num_elems * dtype.bits + 7) / 8; + int total_elem_bytes = TVMNDArray_DataSizeBytes(array); array->dl_tensor.data = TVMBackendAllocWorkspace(kDLCPU, 0, total_elem_bytes, dtype.code, dtype.bits); memset(array->dl_tensor.data, 0, total_elem_bytes); @@ -136,6 +140,15 @@ int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndi return 0; } +int TVMNDArray_RandomFill(TVMNDArray* arr) { + int64_t num_bytes = TVMNDArray_DataSizeBytes(arr); + if (num_bytes < 0 || num_bytes > SIZE_MAX) { + return 
kTvmErrorFunctionCallInvalidArg;
+  }
+
+  return TVMPlatformGenerateRandom(arr->dl_tensor.data, (size_t)num_bytes);
+}
+
 int TVMNDArray_Release(TVMNDArray* arr) {
   tvm_crt_error_t err;
   DLDevice dev = {kDLCPU, 0};
diff --git a/src/runtime/crt/crt_config-template.h b/src/runtime/crt/crt_config-template.h
index 7949aea6f171..aa718a303744 100644
--- a/src/runtime/crt/crt_config-template.h
+++ b/src/runtime/crt/crt_config-template.h
@@ -37,7 +37,7 @@
 #define TVM_CRT_MAX_ARGS 10
 
 /*! Size of the global function registry, in bytes. */
-#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 250
+#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512
 
 /*! Maximum number of registered modules. */
 #define TVM_CRT_MAX_REGISTERED_MODULES 2
diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h
index f878477e7b42..e5869ed2a303 100644
--- a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h
+++ b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h
@@ -44,6 +44,10 @@ typedef struct TVMNDArray {
 int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
                       TVMNDArray* array);
 
+int64_t TVMNDArray_DataSizeBytes(TVMNDArray* array);
+
+int TVMNDArray_RandomFill(TVMNDArray* array);
+
 int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
                      TVMNDArray* array);
 
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 2fa73971d000..12a739722a5c 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -365,8 +365,23 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
   } else if (name == "profile") {
     return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
         [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
-          return this->Profile(collectors);
+          // We cannot send Arrays over rpc, so in order to support profiling
+          // on remotes, we accept a nullptr for collectors.
+          if (collectors.defined()) {
+            return this->Profile(collectors);
+          } else {
+            return this->Profile({});
+          }
         });
+  } else if (name == "profile_rpc") {
+    // We cannot return a Report over RPC because the TVM RPC mechanism only
+    // supports a subset of Object classes. Instead we serialize it on the
+    // remote (here) and deserialize it on the other end.
+    return TypedPackedFunc<std::string()>([sptr_to_self, this]() {
+      PackedFunc profile = GetFunction("profile", sptr_to_self);
+      profiling::Report report = profile(Array<profiling::MetricCollector>());
+      return report->AsJSON();
+    });
   } else {
     return GraphExecutor::GetFunction(name, sptr_to_self);
   }
diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc
index 88815c388ccd..41aa5855ceeb 100644
--- a/src/runtime/hexagon/hexagon_module.cc
+++ b/src/runtime/hexagon/hexagon_module.cc
@@ -483,9 +483,10 @@ hexagon::ArgLayout HexagonModuleNode::BuildArgLayout(const TVMArgs& As) const {
         ICHECK_EQ(static_cast(A), static_cast(A));
         Args.Push(static_cast(A));
         break;
-      // 64-bit values
+      // As above, treat floating point values as float32.
case kDLFloat: - Args.Push(static_cast(A)); + ICHECK_EQ(static_cast(A), static_cast(static_cast(A))); + Args.Push(static_cast(static_cast(A))); break; case kTVMOpaqueHandle: diff --git a/src/runtime/hexagon/launcher/CMakeLists.txt b/src/runtime/hexagon/launcher/CMakeLists.txt new file mode 100644 index 000000000000..d3a2f4f8161d --- /dev/null +++ b/src/runtime/hexagon/launcher/CMakeLists.txt @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.2) +project(HexagonLauncher C CXX) + +if(NOT "${FASTRPC_LIBS}" STREQUAL "SKEL" AND + NOT "${FASTRPC_LIBS}" STREQUAL "STUB") + message(SEND_ERROR "Please set FASTRPC_LIBS to either SKEL or STUB") +endif() + +if(NOT DEFINED USE_HEXAGON_SDK) + message(SEND_ERROR "Please set USE_HEXAGON_SDK to the location of Hexagon SDK") +endif() +if (NOT DEFINED USE_HEXAGON_ARCH) + message(SEND_ERROR "Please set USE_HEXAGON_ARCH to the Hexagon architecture version") +endif() + +include(../../../../cmake/modules/HexagonSDK.cmake) + +find_hexagon_sdk_root("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}") + +include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_REMOTE_ROOT}) + +set(QAIC_EXE "${HEXAGON_QAIC_EXE}") +foreach(INCDIR IN LISTS HEXAGON_SDK_INCLUDES HEXAGON_REMOTE_ROOT) + list(APPEND QAIC_FLAGS "-I${INCDIR}") +endforeach() + +set(LAUNCHER_SRC "${CMAKE_CURRENT_SOURCE_DIR}") +set(CMAKE_SKIP_RPATH TRUE) + +# Qaic for the domain header. +# +# Don't add paths to these filenames, or otherwise cmake may spontaneously +# add -o option to the qaic invocation (with an undesirable path). +set(LAUNCHER_RPC_IDL "launcher_rpc.idl") +set(LAUNCHER_RPC_H "launcher_rpc.h") +set(LAUNCHER_RPC_SKEL_C "launcher_rpc_skel.c") +set(LAUNCHER_RPC_STUB_C "launcher_rpc_stub.c") + +add_custom_command( + OUTPUT ${LAUNCHER_RPC_SKEL_C} ${LAUNCHER_RPC_STUB_C} + "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" + COMMAND ${QAIC_EXE} ${QAIC_FLAGS} + "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" + COMMAND ${CMAKE_COMMAND} -E rename "${LAUNCHER_RPC_H}" + "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" + MAIN_DEPENDENCY "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" +) + + +if("${FASTRPC_LIBS}" STREQUAL "SKEL") + # Skel libraries. + # + if (NOT DEFINED TVM_RUNTIME_HEXAGON) + message(SEND_ERROR "Please set TVM_RUNTIME_HEXAGON=/path/to/libtvm_runtime.a") + endif() + + include_directories(SYSTEM ${HEXAGON_QURT_INCLUDES}) + include_directories( + "${LAUNCHER_SRC}" + "${LAUNCHER_SRC}/../../../../include" + "${LAUNCHER_SRC}/../../../../3rdparty/dlpack/include" + "${LAUNCHER_SRC}/../../../../3rdparty/dmlc-core/include" + ) + link_directories(${HEXAGON_QURT_LIBS}) + + add_definitions(-D_MACH_I32=int) + add_definitions(-DDMLC_CXX11_THREAD_LOCAL=0) + add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) + + # Extra compile flags (both C and C++). 
+ set(EXTRA_COMP_FLAGS + "-O3" + "-m${USE_HEXAGON_ARCH}" + ) + string(REGEX REPLACE ";" " " EXTRA_COMP_FLAGS_STR "${EXTRA_COMP_FLAGS}") + set(CMAKE_C_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_CXX_FLAGS}") + + set(EXTRA_LINK_FLAGS + "-lposix" + "-lqurt" + "-Wl,--export-dynamic" + "-Wl,--whole-archive ${TVM_RUNTIME_HEXAGON} -Wl,--no-whole-archive" + "-Wl,--defsym=HEAP_SIZE=0x40000000" + ) + string(REGEX REPLACE ";" " " EXTRA_LINK_FLAGS_STR "${EXTRA_LINK_FLAGS}") + + set(SKEL_SRCS + "launcher_core.cc" + "launcher_hexagon.cc" + ) + add_library(launcher_rpc_skel SHARED + "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" + "${LAUNCHER_RPC_SKEL_C}" + "${SKEL_SRCS}" + ) + + # Extra linker flags for linking shared libraries. + set_target_properties(launcher_rpc_skel PROPERTIES + LINK_FLAGS ${EXTRA_LINK_FLAGS_STR} + ) +else() + # Stub libraries. + # + if (NOT DEFINED TVM_RUNTIME_ANDROID) + message(SEND_ERROR "Please set TVM_RUNTIME_ANDROID=/path/to/libtvm_runtime.so") + endif() + + include_directories(SYSTEM + "${HEXAGON_SDK_INCLUDES}" + "${HEXAGON_RPCMEM_ROOT}/inc" + ) + include_directories( + "${LAUNCHER_SRC}" + "${LAUNCHER_SRC}/../../../../include" + "${LAUNCHER_SRC}/../../../../3rdparty/dlpack/include" + "${LAUNCHER_SRC}/../../../../3rdparty/dmlc-core/include" + ) + link_directories(${HEXAGON_REMOTE_ROOT}) + + add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) + + set(STUB_SRCS + "launcher_android.cc" + "launcher_core.cc" + "launcher_main.cc" + "launcher_util.cc" + ) + + add_executable(launcher_android + "${STUB_SRCS}" + "${LAUNCHER_RPC_STUB_C}" + ) + target_link_libraries(launcher_android cdsprpc log) + + set_target_properties(launcher_android PROPERTIES + LINK_FLAGS "${TVM_RUNTIME_ANDROID}" + ) +endif() diff --git a/src/runtime/hexagon/launcher/README.md b/src/runtime/hexagon/launcher/README.md new file mode 100644 index 000000000000..a8a570918514 --- /dev/null +++ b/src/runtime/hexagon/launcher/README.md @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + +# Hexagon Graph Launcher + +## Compilation + +The launcher consists of two parts: part running on Hexagon, and part running +on Android. They need to be compiled separately. Since some source files are +shared between these two parts, make sure to delete all object files between +compilations. Compile the Hexagon code first. + +The supported Snapdragon architectures are 855, 865, and 888. + +### Prerequisites + +1. Android NDK version r19c or later. +2. Hexagon SDK version 4.0.0 or later. + +Android NDK can be downloaded from https://developer.android.com/ndk. +Hexagon SDK is available at //developer.qualcomm.com/software/hexagon-dsp-sdk. + +### Compilation of the Hexagon part + +1. Build the static version of TVM runtime for Hexagon. Use Hexagon clang + from the Hexagon SDK. This step is the same as building the shared version, + except at the cmake step, add `-DBUILD_STATIC_RUNTIME=ON`. The compilation + step should create `libtvm_runtime.a`. + +2. Create a subdirectory for the build files, and run `cmake` with the + following variables set: + - `FASTRPC_LIBS=SKEL` + - `USE_HEXAGON_SDK` to the path to the Hexagon SDK + - `CMAKE_C_COMPILER=hexagon-clang` + - `CMAKE_CXX_COMPILER=hexagon-clang++` + - `USE_HEXAGON_ARCH` to one of v65, v66, v68 + - `TVM_RUNTIME_HEXAGON=/path/to/libtvm_runtime.a` _statically_ linked + TVM runtime + + Make sure to provide the path to launcher's `CMakeLists.txt` directory + in `cmake` invocation. + +3. Run `make`. This will create `liblauncher_rpc_skel.so`. 
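For concreteness, the Hexagon-side configure-and-build step above can be scripted as follows (a sketch: every path is a placeholder to adjust for your SDK and checkout):

```python
import subprocess

subprocess.run(
    [
        "cmake",
        "-DCMAKE_C_COMPILER=hexagon-clang",
        "-DCMAKE_CXX_COMPILER=hexagon-clang++",
        "-DFASTRPC_LIBS=SKEL",
        "-DUSE_HEXAGON_SDK=/path/to/hexagon-sdk",
        "-DUSE_HEXAGON_ARCH=v68",
        "-DTVM_RUNTIME_HEXAGON=/path/to/libtvm_runtime.a",
        "/path/to/tvm/src/runtime/hexagon/launcher",
    ],
    check=True,
)
subprocess.run(["make"], check=True)
```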
+ +### Compilation of the Android part + +1. Build TVM runtime for Android, using clang for AArch64 from the Android + NDK. Unlike in the Hexagon case, this should be the dynamic library (which + is the default), i.e. `libtvm_runtime.so`. + +2. Create a subdirectory for the build files (different from the one used for + Hexagon files), and run `cmake` with the following variables set: + - `FASTRPC_LIBS=STUB` + - `USE_HEXAGON_SDK` to the path to the Hexagon SDK + - `CMAKE_C_COMPILER=aarch64-linux-android28-clang` (or later) + - `CMAKE_CXX_COMPILER=aarch64-linux-android28-clang++` (or later) + - `USE_HEXAGON_ARCH` to one of v65, v66, v68 (same as for the Hexagon part) + - `TVM_RUNTIME_ANDROID=/path/to/libtvm_runtime.so` dynamically or + statically linked TVM runtime + +3. Run `make`. This will create `launcher_android`. + +## Execution + +From the Android shell, do +``` +./launcher_android --in_config input.json --out_config output.json +``` + +You may need to add the location of `libtvm_runtime.so` to `LD_LIBRARY_PATH`. +See below for more information about the setup and launcher's inputs. + +### Preparation steps + +Copy the following binaries to the device: +- `liblauncher_rpc_skel.so`: created by the compilation step for Hexagon, +- `libgcc.so`: take this one from the Hexagon toolchain, +- `launcher_android`: created by the compilation step for Android, +- `libtvm_runtime.so`: built for Android. + +These are only the binaries related to the launcher itself. To run a model +copy the shared object with the model and the model JSON file over to the +device (both are obtained from relay). Also, copy all input files for the +model as well. + +The following snippet illustrates how to obtain the shared object and the +JSON file from a TFLite model (using Inception V3 as an example): + +``` +# Skipped imports, etc. + +with open("inception_v3.tflite", "rb") as f: + tflite_model_buf = f.read() +tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +shape_dict = { "input": [1,299,299,3] } +dtype_dict = { "input": "float32" } + +mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict +) + +target = tvm.target.hexagon('v68', link_params=True) +with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target, target_host=target, params=params, mod_name="default") + +# Save model.so and model.json: +with open('model.json', 'w') as f: + f.write(lib.get_graph_json()) +lib.get_lib().save('model.so') +``` + +The final thing is to prepare a JSON configuration file for the launcher. +The JSON has two attributes describing the model: `model-library` and +`model-json`, and an attribute `inputs`, which is a list of records, one +for each input file. +An input file record has three attributes: `file`, `shape`, and `dtype`. + +Below is an example of the input config file for Inception V3: +``` +{ + "model-library": "inceptionv3-float32.so", + "model-json": "inceptionv3-float32.json", + "inputs" : [ + { + "file": "panda_299x299_fp.dat", + "shape": [1,299,299,3], + "dtype": "float32" + } + ] +} +``` + +The launcher will then create the output JSON file (with the name given via +`--out_config`) containing information about the execution time and the model +outputs. 
The output JSON file has three attributes: `pcycles` and `usecs`, which contain
the execution duration in processor cycles and microseconds respectively, and
`outputs`, which is a list of output file records whose syntax is identical to
the input file records in the input config.
A sample output JSON from running the Inception V3 model may look like
```
{
  "pcycles": 112965680178,
  "usecs": 79532302,
  "outputs": [
    {
      "file": "output0.dat",
      "shape": [1, 1001],
      "dtype": "float32"
    }
  ]
}
```

# Disclaimer

The launcher does not perform any correctness verification. To verify
correctness, copy the output files from the device and check their contents.

This launcher is intended for prototyping and does not apply any performance
optimizations, so the measured performance may be very poor.
diff --git a/src/runtime/hexagon/launcher/launcher_android.cc b/src/runtime/hexagon/launcher/launcher_android.cc
new file mode 100644
index 000000000000..c0e428cb63ca
--- /dev/null
+++ b/src/runtime/hexagon/launcher/launcher_android.cc
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "launcher_core.h"
+#include "launcher_rpc.h"
+
+AEEResult enable_unsigned_pd(bool enable) {
+  remote_rpc_control_unsigned_module data{static_cast<int>(enable), CDSP_DOMAIN_ID};
+  AEEResult rc = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, &data, sizeof(data));
+  if (rc != AEE_SUCCESS) {
+    std::cout << "error " << (enable ?
"enabling" : "disabling") << " unsigned PD\n"; + } + return rc; +} + +AEEResult set_remote_stack_size(int size) { + remote_rpc_thread_params th_data{CDSP_DOMAIN_ID, -1, size}; + AEEResult rc = remote_session_control(FASTRPC_THREAD_PARAMS, &th_data, sizeof(th_data)); + if (rc != AEE_SUCCESS) { + std::cout << "error setting remote stack size: " << std::hex << rc << '\n'; + } + return rc; +} + +struct RPCChannel : public ExecutionSession { + explicit RPCChannel(const std::string& uri) { + enable_unsigned_pd(true); + set_remote_stack_size(128 * 1024); + + int rc = launcher_rpc_open(uri.c_str(), &handle); + if (rc != AEE_SUCCESS) { + handle = -1; + } + } + + ~RPCChannel() { + if (handle == -1) { + return; + } + + for (void* ptr : allocations) { + rpcmem_free(ptr); + } + if (model_loaded) { + unload_model(); + } + launcher_rpc_close(handle); + handle = -1; + } + + void* alloc_mem(size_t nbytes, size_t align) override { + void* host_ptr = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, nbytes); + if (host_ptr != nullptr) { + allocations.push_back(host_ptr); + } + return host_ptr; + } + + void free_mem(void* addr) override { + auto f = std::find(allocations.begin(), allocations.end(), addr); + if (f != allocations.end()) { + allocations.erase(f); + rpcmem_free(addr); + } + } + + bool load_model(const std::string& model_path, const std::string& model_json) override { + AEEResult rc = launcher_rpc_load(handle, model_path.c_str(), model_json.c_str()); + if (rc != AEE_SUCCESS) { + std::cout << "error loading graph module: " << std::hex << rc << '\n'; + } else { + model_loaded = true; + } + return rc == AEE_SUCCESS; + } + + bool unload_model() override { + AEEResult rc = launcher_rpc_unload(handle); + if (rc != AEE_SUCCESS) { + std::cout << "error unloading model: " << std::hex << rc << '\n'; + } + model_loaded = false; + return rc == AEE_SUCCESS; + } + + bool set_input(int input_idx, const tensor_meta* input_meta, const void* input_data) override { + AEEResult rc = launcher_rpc_set_input( + handle, input_idx, reinterpret_cast(input_meta), + input_meta->meta_size(), reinterpret_cast(input_data), + input_meta->data_size()); + if (rc != AEE_SUCCESS) { + std::cout << "error setting model input no." << input_idx << ": " << std::hex << rc << '\n'; + } + return rc == AEE_SUCCESS; + } + + bool run(uint64_t* pcycles, uint64_t* usecs) override { + AEEResult rc = launcher_rpc_run(handle, pcycles, usecs); + if (rc != AEE_SUCCESS) { + std::cout << "error running model: " << std::hex << rc << '\n'; + } + return rc == AEE_SUCCESS; + } + + bool get_num_outputs(int* num_outputs) override { + AEEResult rc = launcher_rpc_get_num_outputs(handle, num_outputs); + if (rc != AEE_SUCCESS) { + std::cout << "error getting number of outputs: " << std::hex << rc << '\n'; + } + return rc == AEE_SUCCESS; + } + + bool get_output(int output_idx, tensor_meta* output_meta, int meta_size, void* output_data, + int data_size) override { + AEEResult rc = launcher_rpc_get_output( + handle, output_idx, reinterpret_cast(output_meta), meta_size, + reinterpret_cast(output_data), data_size); + if (rc != AEE_SUCCESS) { + std::cout << "error getting output no." 
<< output_idx << ": " << std::hex << rc << '\n'; + } + return rc == AEE_SUCCESS; + } + + bool model_loaded = false; + remote_handle64 handle = -1; + std::vector allocations; +}; + +ExecutionSession* create_execution_session() { + auto* session = new RPCChannel(launcher_rpc_URI CDSP_DOMAIN); + if (session->handle == -1) { + delete session; + session = nullptr; + std::cout << "Error opening FastRPC channel\n"; + } + return session; +} diff --git a/src/runtime/hexagon/launcher/launcher_core.cc b/src/runtime/hexagon/launcher/launcher_core.cc new file mode 100644 index 000000000000..364e7abfd171 --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_core.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "launcher_core.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +const std::string TensorConfig::file_key = "file"; // NOLINT(runtime/string) +const std::string TensorConfig::shape_key = "shape"; // NOLINT(runtime/string) +const std::string TensorConfig::dtype_key = "dtype"; // NOLINT(runtime/string) + +std::string tensor_meta::to_string() const { + std::stringstream out; + out << "ndim=" << ndim << ", dtype=" << tvm::runtime::DLDataType2String(dtype) << ", shape="; + for (int i = 0; i != ndim; ++i) { + out << shape[i]; + if (i + 1 < ndim) { + out << 'x'; + } + } + return out.str(); +} + +void TensorConfig::Load(dmlc::JSONReader* reader) { + reader->BeginObject(); + std::string key; + while (!bad && reader->NextObjectItem(&key)) { + if (key == file_key) { + reader->Read(&file_name); + } else if (key == shape_key) { + reader->Read(&shape); + if (shape.empty()) { + std::cout << "error: empty shape\n"; + bad = true; + } + } else if (key == dtype_key) { + reader->Read(&dtype); + } else { + std::cout << "unknown tensor config key: " << key << '\n'; + bad = true; + } + } +} + +void TensorConfig::Save(dmlc::JSONWriter* writer) const { + writer->BeginObject(true); + writer->WriteObjectKeyValue(file_key, file_name); + writer->WriteObjectKeyValue(shape_key, shape); + writer->WriteObjectKeyValue(dtype_key, dtype); + writer->EndObject(); +} + +void ModelConfig::Load(dmlc::JSONReader* reader) { + reader->BeginObject(); + std::string key; + while (!bad && reader->NextObjectItem(&key)) { + if (key == "model-library") { + reader->Read(&model_library); + } else if (key == "model-json") { + reader->Read(&model_json); + } else if (key == "inputs") { + reader->Read(&inputs); + bad = std::any_of(inputs.begin(), inputs.end(), [](auto t) { return t.bad; }); + } else { + std::cout << "unknown model config key: " << key << '\n'; + bad = true; + } + } +} + +void OutputConfig::Save(dmlc::JSONWriter* writer) const { + writer->BeginObject(true); + writer->WriteObjectKeyValue("pcycles", 
pcycles); + writer->WriteObjectKeyValue("usecs", usecs); + writer->WriteObjectKeyValue("outputs", outputs); + writer->EndObject(); +} + +bool read_model_config(const std::string& file_name, ModelConfig* model_config) { + if (model_config == nullptr) { + return false; + } + std::ifstream mfc(file_name); + if (!mfc.is_open()) { + return false; + } + dmlc::JSONReader reader(&mfc); + model_config->Load(&reader); + if (model_config->bad || !mfc) { + return false; + } + return true; +} + +bool write_output_config(const std::string& file_name, OutputConfig* output_config) { + std::ofstream ofc(file_name); + if (!ofc.is_open()) { + return false; + } + dmlc::JSONWriter writer(&ofc); + output_config->Save(&writer); + if (!ofc) { + return false; + } + return true; +} + +Model::Model(tvm::runtime::Module executor, tvm::runtime::Module module, std::string json) + : graph_executor(executor), graph_module(module), graph_json(json) { + // Lookup "run" ahead of time to reduce overhead in the model execution. + run = get_module_func(graph_executor, "run"); +} + +const tvm::runtime::PackedFunc get_runtime_func(const std::string& name) { + if (const tvm::runtime::PackedFunc* pf = tvm::runtime::Registry::Get(name)) { + return *pf; + } + return tvm::runtime::PackedFunc(); +} + +const tvm::runtime::PackedFunc get_module_func(tvm::runtime::Module module, + const std::string& name) { + return module.GetFunction(name, false); +} + +void reset_device_api() { + const tvm::runtime::PackedFunc api = get_runtime_func("device_api.cpu"); + tvm::runtime::Registry::Register("device_api.hexagon", true).set_body(api); +} + +tvm::runtime::Module load_module(const std::string& file_name) { + static const tvm::runtime::PackedFunc loader = get_runtime_func("runtime.module.loadfile_so"); + tvm::runtime::TVMRetValue rv = loader(file_name); + if (rv.type_code() == kTVMModuleHandle) { + return rv.operator tvm::runtime::Module(); + } + return tvm::runtime::Module(); +} + +tvm::runtime::Module create_graph_executor(const std::string& graph_json, + tvm::runtime::Module graph_module, + tvm::runtime::Device device) { + std::string launcher_name = "tvm.graph_executor.create"; + + const tvm::runtime::PackedFunc create_executor = get_runtime_func(launcher_name); + uint64_t device_type = device.device_type; + uint64_t device_id = device.device_id; + + // Use default param lookup function (linked into the module). + tvm::runtime::TVMRetValue rv = create_executor(graph_json, graph_module, device_type, device_id); + return rv.operator tvm::runtime::Module(); +} diff --git a/src/runtime/hexagon/launcher/launcher_core.h b/src/runtime/hexagon/launcher/launcher_core.h new file mode 100644 index 000000000000..e799e1c798cb --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_core.h @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_CORE_H_ +#define TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_CORE_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct tensor_meta { + int ndim; + DLDataType dtype; + int64_t shape[]; + + int meta_size() const { return meta_size(ndim); } + int data_size() const { + int size = tvm::runtime::DataType(dtype).bytes(); + for (int d = 0; d != ndim; ++d) { + size *= shape[d]; + } + return size; + } + + static int meta_size(int ndim) { return sizeof(tensor_meta) + ndim * sizeof(int64_t); } + + std::string to_string() const; +}; + +struct TensorConfig { + static const std::string file_key; + static const std::string shape_key; + static const std::string dtype_key; + + std::string file_name; + std::vector shape; + std::string dtype; + bool bad = false; + + void Load(dmlc::JSONReader* reader); + void Save(dmlc::JSONWriter* writer) const; +}; + +struct ModelConfig { + std::string model_library; + std::string model_json; + std::vector inputs; + bool bad = false; + + void Load(dmlc::JSONReader* reader); +}; + +struct OutputConfig { + uint64_t pcycles; + uint64_t usecs; + std::vector outputs; + + void Save(dmlc::JSONWriter* writer) const; +}; + +struct Model { + Model(tvm::runtime::Module executor, tvm::runtime::Module module, std::string json); + + tvm::runtime::Module graph_executor; + tvm::runtime::Module graph_module; + std::string graph_json; + + static tvm::runtime::Device device() { + return tvm::runtime::Device{static_cast(kDLHexagon), 0}; + } + + tvm::runtime::PackedFunc run; +}; + +struct ExecutionSession { + template + T* alloc(size_t bytes, size_t align = 1) { + return reinterpret_cast(alloc_mem(bytes, align)); + } + void free(void* ptr) { free_mem(ptr); } + + virtual void* alloc_mem(size_t bytes, size_t align) = 0; + virtual void free_mem(void* ptr) = 0; + + virtual bool load_model(const std::string& model_path, const std::string& model_json) = 0; + virtual bool unload_model() = 0; + + virtual bool set_input(int input_idx, const tensor_meta* input_meta, const void* input_data) = 0; + virtual bool run(uint64_t* pcycles, uint64_t* usecs) = 0; + virtual bool get_num_outputs(int* num_outputs) = 0; + virtual bool get_output(int output_idx, tensor_meta* output_meta, int meta_size, + void* output_data, int data_size) = 0; +}; + +bool read_model_config(const std::string& file_name, ModelConfig* model_config); +bool write_output_config(const std::string& file_name, OutputConfig* output_config); + +void reset_device_api(); + +tvm::runtime::Module load_module(const std::string& file_name); + +const tvm::runtime::PackedFunc get_runtime_func(const std::string& name); +const tvm::runtime::PackedFunc get_module_func(tvm::runtime::Module module, + const std::string& name); + +tvm::runtime::Module create_graph_executor(const std::string& graph_json, + tvm::runtime::Module graph_module, + tvm::runtime::Device device); + +#endif // TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_CORE_H_ diff --git a/src/runtime/hexagon/launcher/launcher_hexagon.cc b/src/runtime/hexagon/launcher/launcher_hexagon.cc new file mode 100644 index 000000000000..0a5d1f55e0c2 --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_hexagon.cc @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +extern "C" { +#include +#include +#include +#include +#include +#include +} + +#include +#include +#include + +#include "launcher_core.h" +#include "launcher_rpc.h" + +static std::unique_ptr TheModel; + +static AEEResult error_too_small(const std::string& func_name, const std::string& value_name, + int given, int needed) { + FARF(ERROR, "%s: %s value too small (%d), need at least %d", func_name.c_str(), + value_name.c_str(), given, needed); + return AEE_EBADPARM; +} + +int __QAIC_HEADER(launcher_rpc_open)(const char* uri, remote_handle64* handle) { + *handle = 0; // Just use any value. + reset_device_api(); + return AEE_SUCCESS; +} + +int __QAIC_HEADER(launcher_rpc_close)(remote_handle64 handle) { + // Comment to stop clang-format from single-lining this function. + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_load)(remote_handle64 handle, const char* module_path, + const char* graph_json) { + if (TheModel) { + // Need to unload first. + FARF(ERROR, "%s: model already loaded, unload first", __func__); + return AEE_EUNABLETOLOAD; + } + + tvm::runtime::Module module = load_module(module_path); + tvm::runtime::Module executor = create_graph_executor(graph_json, module, Model::device()); + + TheModel = std::make_unique(executor, module, graph_json); + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_unload)(remote_handle64 handle) { + if (TheModel) { + TheModel.reset(); + } + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_get_num_inputs)(remote_handle64 handle, int* num_inputs) { + if (!TheModel) { + // No model created. + return AEE_EBADSTATE; + } + + tvm::runtime::PackedFunc get_num_inputs = + get_module_func(TheModel->graph_executor, "get_num_inputs"); + *num_inputs = get_num_inputs(); + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_set_input)(remote_handle64 handle, int input_idx, + const unsigned char* input_meta, int meta_size, + const unsigned char* input_value, int value_size) { + if (!TheModel) { + // No model created. 
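// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Host-side sizing of the flexible-array tensor_meta (launcher_core.h) for a
// 2x3 float32 tensor; launcher_rpc_set_input() below validates incoming
// buffers against exactly these meta_size()/data_size() values.
#include <cstdlib>
#include "launcher_core.h"

int TensorMetaExample() {
  int ndim = 2;
  auto* meta = static_cast<tensor_meta*>(malloc(tensor_meta::meta_size(ndim)));
  meta->ndim = ndim;
  meta->dtype = DLDataType{kDLFloat, 32, 1};  // {code, bits, lanes}
  meta->shape[0] = 2;
  meta->shape[1] = 3;
  int bytes = meta->data_size();  // 4 bytes/elem * 2 * 3 = 24
  free(meta);
  return bytes;
}
// ===== end editor's example =====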
+ FARF(ERROR, "%s: no model created", __func__); + return AEE_EBADSTATE; + } + + const auto* meta = reinterpret_cast(input_meta); + if (meta_size < meta->meta_size()) { + return error_too_small(__func__, "meta_size", meta_size, meta->meta_size()); + } + if (value_size < meta->data_size()) { + return error_too_small(__func__, "value_size", value_size, meta->data_size()); + } + + DLTensor tensor{ + const_cast(input_value), + Model::device(), + meta->ndim, + meta->dtype, + const_cast(meta->shape), + /*strides*/ nullptr, + /*byte_offset*/ 0, + }; + DLManagedTensor managed{tensor, /*manager_ctx*/ nullptr, /*deleter*/ nullptr}; + + auto input = tvm::runtime::NDArray::FromDLPack(&managed); + + tvm::runtime::PackedFunc set_input = get_module_func(TheModel->graph_executor, "set_input"); + set_input(input_idx, input); + + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_get_num_outputs)(remote_handle64 handle, int* num_outputs) { + if (!TheModel) { + // No model created. + return AEE_EBADSTATE; + } + + tvm::runtime::PackedFunc get_num_outputs = + get_module_func(TheModel->graph_executor, "get_num_outputs"); + *num_outputs = get_num_outputs(); + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int output_idx, + unsigned char* output_meta, int meta_size, + unsigned char* output_value, int value_size) { + if (!TheModel) { + // No model created. + return AEE_EBADSTATE; + } + if (meta_size < 0 || value_size < 0) { + return AEE_EBADPARM; + } + if ((output_meta == nullptr && meta_size != 0) || (output_value == nullptr && value_size != 0)) { + // If the pointer is null, the size must be 0. + return AEE_EBADPARM; + } + + tvm::runtime::PackedFunc get_output = get_module_func(TheModel->graph_executor, "get_output"); + tvm::runtime::NDArray output = get_output(output_idx); + + if (meta_size != 0) { + auto* meta = reinterpret_cast(output_meta); + if (meta_size < meta->meta_size(output->ndim)) { + return error_too_small(__func__, "meta_size", meta_size, meta->meta_size(output->ndim)); + } + + meta->ndim = output->ndim; + meta->dtype = output->dtype; + std::copy(&output->shape[0], &output->shape[output->ndim], meta->shape); + } + + if (value_size != 0) { + size_t data_size = tvm::runtime::GetDataSize(*output.operator->()); + if (value_size < data_size) { + return error_too_small(__func__, "value_size", value_size, data_size); + } + + auto data = reinterpret_cast(output->data); + std::copy(data, data + data_size, output_value); + } + + return AEE_SUCCESS; +} + +AEEResult __QAIC_HEADER(launcher_rpc_run)(remote_handle64 handle, uint64_t* pcycles, + uint64_t* usecs) { + if (!TheModel) { + // No model created. + FARF(ERROR, "%s: no model created", __func__); + return AEE_EBADSTATE; + } + + // Reserve HVX. + int res = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL_AVAILABLE); + switch (res) { + case QURT_HVX_RESERVE_NOT_SUPPORTED: + case QURT_HVX_RESERVE_NOT_SUCCESSFUL: + FARF(ERROR, "error reserving HVX: %u", res); + return AEE_EFAILED; + default: + break; + } + // Lock HVX. + int lck = qurt_hvx_lock(QURT_HVX_MODE_128B); + if (lck != 0) { + FARF(ERROR, "error locking HVX: %u", lck); + return AEE_EFAILED; + } + + uint64_t us_begin = HAP_perf_get_time_us(); + uint64_t pc_begin = HAP_perf_get_pcycles(); + + TheModel->run(); + + uint64_t pc_end = HAP_perf_get_pcycles(); + uint64_t us_end = HAP_perf_get_time_us(); + *pcycles = pc_end - pc_begin; + *usecs = us_end - us_begin; + + // Unlock HVX. 
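// ===== Editor's example (illustrative sketch; not part of the patch) =====
// An RAII wrapper over the same QuRT calls used in launcher_rpc_run above, so
// an early return cannot leak the HVX lock. Hexagon-only; the header name is
// assumed, since the include list in this file is elided.
extern "C" {
#include <qurt_hvx.h>
}

class HvxGuard {
 public:
  HvxGuard() {
    int res = qurt_hvx_reserve(QURT_HVX_RESERVE_ALL_AVAILABLE);
    reserved_ =
        res != QURT_HVX_RESERVE_NOT_SUPPORTED && res != QURT_HVX_RESERVE_NOT_SUCCESSFUL;
    locked_ = reserved_ && qurt_hvx_lock(QURT_HVX_MODE_128B) == 0;
  }
  ~HvxGuard() {
    if (locked_) qurt_hvx_unlock();
    if (reserved_) qurt_hvx_cancel_reserve();
  }
  bool ok() const { return locked_; }

 private:
  bool reserved_ = false;
  bool locked_ = false;
};
// ===== end editor's example =====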
+ int unl = qurt_hvx_unlock(); + if (unl != 0) { + FARF(ERROR, "error unlocking HVX: %u", unl); + return AEE_EFAILED; + } + // Release HVX. + int rel = qurt_hvx_cancel_reserve(); + if (rel != 0) { + FARF(ERROR, "error canceling HVX reservation: %u", rel); + return AEE_EFAILED; + } + + return AEE_SUCCESS; +} diff --git a/src/runtime/hexagon/launcher/launcher_main.cc b/src/runtime/hexagon/launcher/launcher_main.cc new file mode 100644 index 000000000000..ac21a7be1636 --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_main.cc @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include +#include +#include +#include + +#include "launcher_core.h" +#include "launcher_util.h" + +ExecutionSession* create_execution_session(); + +int parse_command_line(int argc, char* argv[], std::string* in_path, std::string* out_path) { + static option long_options[] = { + {"in_config", required_argument, nullptr, 0}, + {"out_config", required_argument, nullptr, 0}, + }; + + bool show_usage = false; + int opt, long_index = 0; + while ((opt = getopt_long(argc, argv, "i:o:u:", long_options, &long_index)) != -1) { + if (opt != 0) { + show_usage = true; + continue; + } + switch (long_index) { + case 0: + *in_path = std::string(optarg); + break; + case 1: + *out_path = std::string(optarg); + break; + } + } + if (in_path->empty() || out_path->empty() || show_usage) { + std::cout << "Usage: " << argv[0] << " --" << long_options[0].name << " input.json --" + << long_options[1].name << " output.json\n"; + return 1; + } + return 0; +} + +int main(int argc, char* argv[]) { + std::string in_path, out_path; + if (parse_command_line(argc, argv, &in_path, &out_path) != 0) { + return 1; + } + + ModelConfig config; + if (!read_model_config(in_path, &config)) { + return 1; + } + + ExecutionSession* session_ptr = create_execution_session(); + if (session_ptr == nullptr) { + return 1; + } + ExecutionSession& session = *session_ptr; + + std::cout << "loading model files: " << config.model_json << ", " << config.model_library << '\n'; + std::string json = load_text_file(config.model_json); + if (!session.load_model(config.model_library, json.c_str())) { + return 1; + } + + int max_ndim = 0; + for (const TensorConfig& tc : config.inputs) { + max_ndim = std::max(max_ndim, tc.shape.size()); + } + auto* input_meta = session.alloc(tensor_meta::meta_size(max_ndim)); + + for (int i = 0, e = config.inputs.size(); i != e; ++i) { + const TensorConfig& tc = config.inputs[i]; + input_meta->ndim = tc.shape.size(); + input_meta->dtype = tvm::runtime::String2DLDataType(tc.dtype); + std::copy(tc.shape.begin(), tc.shape.end(), input_meta->shape); + + auto* input_data = session.alloc(input_meta->data_size()); + std::cout << "loading input file #" << i << ": " 
<< tc.file_name << '\n'; + load_binary_file(tc.file_name, input_data, input_meta->data_size()); + if (!session.set_input(i, input_meta, input_data)) { + return 1; + } + } + + OutputConfig output_config; + + std::cout << "running..." << std::flush; + if (!session.run(&output_config.pcycles, &output_config.usecs)) { + std::cout << '\n'; + return 1; + } + std::cout << '\n'; + std::cout << "Finished in " << output_config.pcycles << " pcycles, (" << output_config.usecs + << "us)\n"; + + auto* output_meta = session.alloc(128); + int num_outputs = 0; + if (!session.get_num_outputs(&num_outputs)) { + return 1; + } + + for (int i = 0; i != num_outputs; ++i) { + if (!session.get_output(i, output_meta, 128, nullptr, 0)) { + return 1; + } + int data_size = output_meta->data_size(); + auto* output_data = session.alloc(data_size); + if (!session.get_output(i, output_meta, 128, output_data, data_size)) { + return 1; + } + + TensorConfig oc; + oc.file_name = "output" + std::to_string(i) + ".dat"; + for (int i = 0, e = output_meta->ndim; i != e; ++i) { + oc.shape.push_back(output_meta->shape[i]); + } + oc.dtype = tvm::runtime::DLDataType2String(output_meta->dtype); + write_binary_file(oc.file_name, output_data, data_size); + output_config.outputs.push_back(std::move(oc)); + + session.free(output_data); + } + + if (!write_output_config(out_path, &output_config)) { + return 1; + } + return 0; +} diff --git a/src/runtime/hexagon/launcher/launcher_rpc.idl b/src/runtime/hexagon/launcher/launcher_rpc.idl new file mode 100644 index 000000000000..6677108a76f0 --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_rpc.idl @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "remote.idl" +#include "AEEStdDef.idl" + +typedef sequence buffer; + +interface launcher_rpc : remote_handle64 { + AEEResult load(in string module_path, in string model_json); + AEEResult unload(); + AEEResult get_num_inputs(rout long num_inputs); + AEEResult set_input(in long input_idx, in buffer input_meta, in buffer input_value); + AEEResult get_num_outputs(rout long num_outputs); + AEEResult get_output(in long output_idx, rout buffer output_meta, rout buffer output_value); + AEEResult run(rout uint64_t pcycles, rout uint64_t usecs); +}; diff --git a/src/runtime/hexagon/launcher/launcher_util.cc b/src/runtime/hexagon/launcher/launcher_util.cc new file mode 100644 index 000000000000..9c565167142b --- /dev/null +++ b/src/runtime/hexagon/launcher/launcher_util.cc @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "launcher_util.h" + +#include + +#include +#include +#include +#include +#include +#include + +size_t get_file_size(std::ifstream& in_file) { + std::ifstream::pos_type pos = in_file.tellg(); + size_t size = in_file.seekg(0, std::ios::end).tellg(); + in_file.seekg(pos, std::ios::beg); + return size; +} + +size_t get_file_size(std::ifstream&& in_file) { + return get_file_size(in_file); // calls the & version +} + +std::string load_text_file(const std::string& file_name) { + constexpr size_t block_size = 1024 * 1024; // 1MB + std::ifstream in_file(file_name); + ICHECK(in_file.is_open()) << "cannot open file " << file_name; + size_t file_size = get_file_size(in_file); + std::string buffer(file_size + 1, 0); + + in_file.read(&buffer[0], file_size); + return std::move(buffer); +} + +void* load_binary_file(const std::string& file_name, void* buffer, size_t buffer_size) { + std::ifstream in_file(file_name); + ICHECK(in_file.is_open()) << "cannot open file " << file_name; + size_t file_size = get_file_size(in_file); + + in_file.read(reinterpret_cast(buffer), + std::min(buffer_size, file_size)); + return buffer; +} + +void write_binary_file(const std::string& file_name, void* buffer, size_t buffer_size) { + std::ofstream out_file(file_name); + ICHECK(out_file.is_open()) << "cannot open file " << file_name; + + out_file.write(reinterpret_cast(buffer), buffer_size); +} diff --git a/include/tvm/topi/rocm/normalization.h b/src/runtime/hexagon/launcher/launcher_util.h similarity index 54% rename from include/tvm/topi/rocm/normalization.h rename to src/runtime/hexagon/launcher/launcher_util.h index 2fbb88089286..13db89d052fb 100644 --- a/include/tvm/topi/rocm/normalization.h +++ b/src/runtime/hexagon/launcher/launcher_util.h @@ -17,30 +17,18 @@ * under the License. */ -/*! - * \file rocm/normalization.h - * \brief rocm schedule for LRN and l2 normalization operations - */ -#ifndef TVM_TOPI_ROCM_NORMALIZATION_H_ -#define TVM_TOPI_ROCM_NORMALIZATION_H_ +#ifndef TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_UTIL_H_ +#define TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_UTIL_H_ -#include -#include -#include +#include +#include +#include -namespace tvm { -namespace topi { +size_t get_file_size(std::ifstream& in_file); +size_t get_file_size(std::ifstream&& in_file); -using namespace tvm::te; -namespace rocm { -/*! - * \brief Create a rocm schedule for LRN - * \param outs The output tensors. - * \return A schedule for the given ops. 
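// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Using the launcher_util helpers above: read a text file, then copy it
// byte-for-byte through the binary read/write pair. File names hypothetical.
#include <string>
#include <vector>
#include "launcher_util.h"

void FileHelpersExample() {
  std::string json = load_text_file("input.json");
  std::vector<char> bytes(json.size());
  load_binary_file("input.json", bytes.data(), bytes.size());
  write_binary_file("copy.dat", bytes.data(), bytes.size());
}
// ===== end editor's example =====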
- */ -inline Schedule schedule_lrn(const Array& outs) { return topi::cuda::schedule_lrn(outs); } +std::string load_text_file(const std::string& file_name); +void* load_binary_file(const std::string& file_name, void* buffer, size_t buffer_size); +void write_binary_file(const std::string& file_name, void* buffer, size_t buffer_size); -} // namespace rocm -} // namespace topi -} // namespace tvm -#endif // TVM_TOPI_ROCM_NORMALIZATION_H_ +#endif // TVM_RUNTIME_HEXAGON_LAUNCHER_LAUNCHER_UTIL_H_ diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 2dcd928b24f8..9e6664ff5984 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -404,7 +404,7 @@ TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* throw std::runtime_error(ss.str()); } std::unique_ptr channel(micro_channel); - auto ep = RPCEndpoint::Create(std::move(channel), args[0], ""); + auto ep = RPCEndpoint::Create(std::move(channel), args[0], "", args[6]); auto sess = CreateClientSession(ep); *rv = CreateRPCSessionModule(sess); }); diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc new file mode 100644 index 000000000000..41f867057282 --- /dev/null +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file pipeline_executor.cc + */ +#include "pipeline_executor.h" + +namespace tvm { +namespace runtime { + +void PipelineRuntime::Init(const Array& modules, + const std::string& pipeline_json) { + return; +} + +/* GetFunction can not be pure abstract function, implement an empty function for now. + */ +PackedFunc PipelineRuntime::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + return nullptr; +} + +Module PipelineRuntimeCreate(const Array& m, + const std::string& pipeline_json) { + auto exec = make_object(); + exec->Init(m, pipeline_json); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.pipeline_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = PipelineRuntimeCreate(args[0], args[1]); +}); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h new file mode 100644 index 000000000000..c7625c62b724 --- /dev/null +++ b/src/runtime/pipeline/pipeline_executor.h @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief pipeline executor
+ * \file pipeline_executor.h
+ */
+#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_
+#define TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_
+#include
+
+#include
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief pipeline executor.
+ * This executor class uses the module list and the dependency configuration of the modules as
+ * its parameters, and executes the modules on heterogeneous targets in a pipeline-parallel
+ * manner to improve throughput.
+ *
+ * This executor can be accessed by various languages via the TVM runtime PackedFunc API.
+ */
+class TVM_DLL PipelineRuntime : public ModuleNode {
+ public:
+ /*!
+ * \return The type key of the executor.
+ */
+ const char* type_key() const final { return "PipelineRuntime"; }
+ /*!
+ * \brief Initialize the pipeline executor with the module array and JSON text.
+ * \param modules The module list used for building the pipeline.
+ * \param pipeline_json The configuration of module dependencies.
+ */
+ void Init(const Array<Module>& modules, const std::string& pipeline_json);
+ /*!
+ * \brief Give frontends access to packed functions.
+ * \param name The name of the function.
+ * \param sptr_to_self The pointer to the module node.
+ * \return The corresponding packed function.
+ */
+ virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+};
+} // namespace runtime
+} // namespace tvm
+#endif // TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_
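// ===== Editor's example (illustrative sketch; not part of the patch) =====
// How a frontend could reach the still-skeletal pipeline executor through the
// global "tvm.pipeline_executor.create" registered in pipeline_executor.cc
// above. The module array and JSON string are placeholders.
#include <tvm/runtime/logging.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>

tvm::runtime::Module CreatePipeline(tvm::runtime::Array<tvm::runtime::Module> mods,
                                    std::string config_json) {
  const auto* create = tvm::runtime::Registry::Get("tvm.pipeline_executor.create");
  ICHECK(create != nullptr) << "tvm.pipeline_executor.create is not registered";
  return (*create)(mods, config_json);
}
// ===== end editor's example =====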
diff --git a/src/runtime/profiling.cc index 596b6ace8831..bd59be87f7d9 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -22,6 +22,7 @@
 * \brief Runtime profiling including timers.
 */
+#include
 #include
 #include
 #include
@@ -231,7 +232,9 @@ String ReportNode::AsCSV() const {
 namespace {
 void print_metric(std::ostream& os, ObjectRef o) {
 if (o.as<StringObj>()) {
- os << "\"" << Downcast<String>(o) << "\"";
+ os << "{\"string\":"
+ << "\"" << Downcast<String>(o) << "\""
+ << "}";
 } else if (const CountNode* n = o.as<CountNode>()) {
 os << "{\"count\":" << std::to_string(n->value) << "}";
 } else if (const DurationNode* n = o.as<DurationNode>()) {
@@ -540,6 +543,72 @@ Report::Report(Array<Map<String, ObjectRef>> calls,
 data_ = std::move(node);
 }
+
+Map<String, ObjectRef> parse_metrics(dmlc::JSONReader* reader) {
+ reader->BeginObject();
+ std::string metric_name, metric_value_name;
+ Map<String, ObjectRef> metrics;
+ while (reader->NextObjectItem(&metric_name)) {
+ ObjectRef o;
+ reader->BeginObject();
+ reader->NextObjectItem(&metric_value_name);
+ if (metric_value_name == "microseconds") {
+ double microseconds;
+ reader->Read(&microseconds);
+ o = ObjectRef(make_object<DurationNode>(microseconds));
+ } else if (metric_value_name == "percent") {
+ double percent;
+ reader->Read(&percent);
+ o = ObjectRef(make_object<PercentNode>(percent));
+ } else if (metric_value_name == "count") {
+ int64_t count;
+ reader->Read(&count);
+ o = ObjectRef(make_object<CountNode>(count));
+ } else if (metric_value_name == "string") {
+ std::string s;
+ reader->Read(&s);
+ o = String(s);
+ } else {
+ LOG(FATAL) << "Cannot parse metric of type " << metric_value_name
+ << "; valid types are microseconds, percent, count, string.";
+ }
+ metrics.Set(metric_name, o);
+ // Necessary to make sure that the parser hits the end of the object.
+ ICHECK(!reader->NextObjectItem(&metric_value_name));
+ // EndObject does not exist, leaving this here for clarity
+ // reader.EndObject();
+ }
+ // reader.EndObject();
+ return metrics;
+}
+
+Report Report::FromJSON(String json) {
+ std::stringstream input(json.operator std::string());
+ dmlc::JSONReader reader(&input);
+ std::string key;
+ Array<Map<String, ObjectRef>> calls;
+ Map<String, Map<String, ObjectRef>> device_metrics;
+
+ reader.BeginObject();
+ while (reader.NextObjectItem(&key)) {
+ if (key == "calls") {
+ reader.BeginArray();
+ while (reader.NextArrayItem()) {
+ calls.push_back(parse_metrics(&reader));
+ }
+ // reader.EndArray();
+ } else if (key == "device_metrics") {
+ reader.BeginObject();
+ std::string device_name;
+ while (reader.NextObjectItem(&device_name)) {
+ device_metrics.Set(device_name, parse_metrics(&reader));
+ }
+ // reader.EndObject();
+ }
+ }
+
+ return Report(calls, device_metrics);
+}
+
 TVM_REGISTER_OBJECT_TYPE(DurationNode);
 TVM_REGISTER_OBJECT_TYPE(PercentNode);
 TVM_REGISTER_OBJECT_TYPE(CountNode);
@@ -551,6 +620,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { ret
 TVM_REGISTER_GLOBAL("runtime.profiling.AsJSON").set_body_typed([](Report n) { return n->AsJSON(); });
+TVM_REGISTER_GLOBAL("runtime.profiling.FromJSON").set_body_typed(Report::FromJSON);
 TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) { return DeviceWrapper(dev); });
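// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Round trip through the FromJSON path added above. Each metric is a JSON
// object keyed by exactly one of "microseconds", "percent", "count", or
// "string", matching parse_metrics(); the metric and device names below are
// made up for illustration.
#include <string>
#include <tvm/runtime/registry.h>

tvm::runtime::ObjectRef ParseReportExample() {
  const auto* from_json = tvm::runtime::Registry::Get("runtime.profiling.FromJSON");
  std::string json =
      "{\"calls\": [{\"Name\": {\"string\": \"conv2d\"},"
      " \"Duration (us)\": {\"microseconds\": 12.5},"
      " \"Count\": {\"count\": 3}}],"
      " \"device_metrics\": {\"cpu0\": {\"Duration (us)\": {\"microseconds\": 40.0}}}}";
  return (*from_json)(json);
}
// ===== end editor's example =====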
diff --git a/src/runtime/rpc/rpc_endpoint.cc index e83f062795e4..2f1fc54f39d0 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -691,11 +691,13 @@ void RPCEndpoint::Init() {
 * the key to modify their behavior.
 */
 std::shared_ptr<RPCEndpoint> RPCEndpoint::Create(std::unique_ptr<RPCChannel> channel,
- std::string name, std::string remote_key) {
+ std::string name, std::string remote_key,
+ TypedPackedFunc<void()> fshutdown) {
 std::shared_ptr<RPCEndpoint> endpt = std::make_shared<RPCEndpoint>();
 endpt->channel_ = std::move(channel);
 endpt->name_ = std::move(name);
 endpt->remote_key_ = std::move(remote_key);
+ endpt->fshutdown_ = fshutdown;
 endpt->Init();
 return endpt;
 }
@@ -734,6 +736,7 @@ void RPCEndpoint::ServerLoop() {
 (*f)();
 }
 channel_.reset(nullptr);
+ if (fshutdown_ != nullptr) fshutdown_();
 }
 int RPCEndpoint::ServerAsyncIOEventHandler(const std::string& in_bytes, int event_flag) {
diff --git a/src/runtime/rpc/rpc_endpoint.h index 7c11a1aeac01..f6784faba0f6 100644
--- a/src/runtime/rpc/rpc_endpoint.h
+++ b/src/runtime/rpc/rpc_endpoint.h
@@ -161,11 +161,13 @@ class RPCEndpoint {
 * \param channel The communication channel.
 * \param name The local name of the session, used for debug
 * \param remote_key The remote key of the session
 * if remote_key equals "%toinit", we need to re-initialize
 * it by event handler.
+ * \param fshutdown The shutdown PackedFunc, called when the server loop exits.
 */
 static std::shared_ptr<RPCEndpoint> Create(std::unique_ptr<RPCChannel> channel, std::string name,
- std::string remote_key);
+ std::string remote_key,
+ TypedPackedFunc<void()> fshutdown = nullptr);
 private:
 class EventHandler;
@@ -190,6 +192,8 @@
 std::string name_;
 // The remote key
 std::string remote_key_;
+ // The shutdown PackedFunc
+ TypedPackedFunc<void()> fshutdown_;
 };
 /*!
diff --git a/src/runtime/vm/profiler/vm.cc index 6d893114d623..d6575c35d10d 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -52,8 +52,14 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
 }
 }
- std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
- prof_ = profiling::Profiler(devices, cs);
+ // We cannot send Arrays over rpc, so in order to support profiling
+ // on remotes, we accept a nullptr for collectors.
+ if (collectors.defined()) {
+ std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
+ prof_ = profiling::Profiler(devices, cs);
+ } else {
+ prof_ = profiling::Profiler(devices, {});
+ }
 auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
 // warmup
@@ -68,6 +74,15 @@
 prof_ = dmlc::optional<profiling::Profiler>(); // releases hardware counters
 return report;
 });
+ } else if (name == "profile_rpc") {
+ // We cannot return a Report over RPC because the TVM RPC mechanism only
+ // supports a subset of Object classes. Instead we serialize it on the
+ // remote (here) and deserialize it on the other end.
+ return TypedPackedFunc<std::string(std::string)>([sptr_to_self, this](std::string arg_name) {
+ PackedFunc profile = GetFunction("profile", sptr_to_self);
+ profiling::Report report = profile(arg_name, Array<profiling::MetricCollector>());
+ return report->AsJSON();
+ });
 } else {
 return VirtualMachine::GetFunction(name, sptr_to_self);
 }
diff --git a/src/support/nd_int_set.h new file mode 100644 index 000000000000..ae4a0386d404
--- /dev/null
+++ b/src/support/nd_int_set.h
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SUPPORT_ND_INT_SET_H_ +#define TVM_SUPPORT_ND_INT_SET_H_ + +#include +#include + +#include +#include + +namespace tvm { +namespace support { + +/*! \brief An N-dimensional integer set representing a rectangle region */ +using NDIntSet = std::vector; + +/*! + * \brief Construct an N-dimensional integer set representing a region. + * \param region The region. + * \return The constructed set. + */ +inline NDIntSet NDIntSetFromRegion(const tir::Region& region) { + NDIntSet result; + result.reserve(region.size()); + for (const Range& range : region) { + result.push_back(arith::IntSet::FromRange(range)); + } + return result; +} + +/*! + * \brief Construct an N-dimensional integer set representing a shape. + * \param shape The shape which is an array of the length of each dimension. + * \return The constructed set. + */ +inline NDIntSet NDIntSetFromShape(const Array& shape) { + PrimExpr zero = Integer(0); + NDIntSet result; + result.reserve(shape.size()); + for (const PrimExpr& extent : shape) { + result.push_back(arith::IntSet::FromMinExtent(zero, extent)); + } + return result; +} + +/*! + * \brief Construct an N-dimensional integer set representing a point. + * \param indices The N-dimensional indices representing the point. + * \return The constructed set. + */ +inline NDIntSet NDIntSetFromPoint(const Array& indices) { + NDIntSet result; + result.reserve(indices.size()); + for (const PrimExpr& index : indices) { + result.push_back(arith::IntSet::SinglePoint(index)); + } + return result; +} + +/*! + * \brief Create a union set of two sets, possibly relaxed. The RHS set will be combined into the + * LHS set. + * \param lhs The first N-dimensional integer set + * \param rhs The second N-dimensional integer set + */ +inline void NDIntSetUnionWith(NDIntSet* lhs, const NDIntSet& rhs) { + ICHECK_EQ(lhs->size(), rhs.size()); + int ndim = rhs.size(); + for (int i = 0; i < ndim; ++i) { + arith::IntSet& int_set = lhs->at(i); + int_set = arith::Union({int_set, rhs.at(i)}); + } +} + +/*! + * \brief Union a list of N-dimensional integer sets + * \param nd_int_sets The N-dimensional integer sets to be merged. + * \return The result of the union + */ +inline NDIntSet NDIntSetUnion(const std::vector& nd_int_sets) { + ICHECK(!nd_int_sets.empty()); + int n = nd_int_sets.size(); + if (n == 1) { + return nd_int_sets[0]; + } + int ndim = nd_int_sets[0].size(); + for (int i = 1; i < n; ++i) { + ICHECK_EQ(nd_int_sets[i].size(), ndim); + } + NDIntSet result; + result.reserve(ndim); + Array int_sets(n, arith::IntSet{nullptr}); + for (int dim = 0; dim < ndim; ++dim) { + for (int i = 0; i < n; ++i) { + int_sets.Set(i, nd_int_sets[i][dim]); + } + result.push_back(arith::Union(int_sets)); + } + return result; +} + +/*! + * \brief Create an empty N-dimensional integer set. + * \param ndim The number of dimensions. + * \return The constructed set. + */ +inline NDIntSet NDIntSetEmpty(int ndim) { + return std::vector(ndim, arith::IntSet::Nothing()); +} + +/*! + * \brief The N-dimensional version of EvalSet. + * \param nd_int_set The N-dimensional integer set to be evaluated. 
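// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Usage of the NDIntSet helpers above: build a 2-D set from a shape, then
// widen it with a point via NDIntSetUnionWith.
#include <tvm/arith/int_set.h>
#include <tvm/ir/expr.h>
#include "nd_int_set.h"

void NDIntSetExample() {
  using namespace tvm;
  using namespace tvm::support;
  NDIntSet a = NDIntSetFromShape({Integer(4), Integer(8)});  // [0, 4) x [0, 8)
  NDIntSet b = NDIntSetFromPoint({Integer(5), Integer(2)});  // the point (5, 2)
  NDIntSetUnionWith(&a, b);  // dim 0 now covers [0, 5]; dim 1 still [0, 8)
}
// ===== end editor's example =====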
+ * \param dom_map The domain of each variable. + * \return An N-dimensional integer set that can cover all the possible values of the N-dimensional + * integer set. + * \sa EvalSet + */ +inline NDIntSet NDIntSetEval( + const NDIntSet& nd_int_set, + const std::unordered_map& dom_map) { + NDIntSet ret; + ret.reserve(nd_int_set.size()); + for (const arith::IntSet& s : nd_int_set) { + ret.push_back(EvalSet(s, dom_map)); + } + return ret; +} + +} // namespace support +} // namespace tvm + +#endif // TVM_SUPPORT_ND_INT_SET_H_ diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 7770e42086de..33a09b1ded66 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -230,11 +230,11 @@ runtime::Module BuildAMDGPU(IRModule mod, Target target) { cg->Init("TVMAMDGPUModule", tm.get(), ctx.get(), false, false, false); - for (auto kv : mod->functions) { - ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; - auto f = Downcast(kv.second); - cg->AddFunction(f); - } + cg->AddFunctionsOrdered(mod->functions.begin(), mod->functions.end(), [](auto& kv) { + ICHECK(kv.second->template IsInstance()) + << "Can only lower IR Module with PrimFuncs"; + return Downcast(kv.second); + }); const auto* find_rocm_bitcodes = tvm::runtime::Registry::Get("tvm_callback_rocm_bitcode_path"); Array bitcode_files = (*find_rocm_bitcodes)(); diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index 26356a547990..d9d0d1f3d6a4 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -671,11 +671,9 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { } return vec; }; - std::string llvm_options_str; - if (const Optional llvm_options = target->GetAttr("llvm-options")) { - llvm_options_str = "llvm," + llvm_options.value(); - } else { - llvm_options_str = "llvm"; + std::string llvm_options_str = "llvm"; + if (const auto& llvm_options = target->GetAttr>("llvm-options")) { + for (const String& s : llvm_options.value()) llvm_options_str += "," + s; } // Postprocess the LLVM options string: replace '@' with '=', and ',' with ' '. for (int i = 0, e = llvm_options_str.size(); i != e; ++i) { @@ -706,12 +704,37 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { std::unique_ptr tm = GetLLVMTargetMachine(target); std::unique_ptr ctx(new llvm::LLVMContext()); std::unique_ptr cg(new CodeGenHexagon()); - cg->Init("TVMHexagonModule", tm.get(), ctx.get(), false, false, false); + + std::vector funcs; + Map linked_params; + bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); + for (auto kv : mod->functions) { ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + if (could_have_linked_params && + kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { + // If `f` is the linked-params function, extract the parameters from the + // attribute dictionary, and skip the codegen. 
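// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Constructing a Hexagon target that enables the `link-params` attribute
// (registered for the hexagon target kind later in this diff), which makes
// BuildHexagon take the linked-parameter path above. Config-dict form; the
// equivalent target-string syntax is assumed to be "hexagon -link-params=1".
#include <tvm/ir/expr.h>
#include <tvm/target/target.h>

tvm::Target HexagonTargetWithLinkedParams() {
  tvm::Map<tvm::runtime::String, tvm::runtime::ObjectRef> config{
      {"kind", tvm::runtime::String("hexagon")},
      {"link-params", tvm::Bool(true)},
  };
  return tvm::Target(config);
}
// ===== end editor's example =====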
+ auto attrs_dict = Downcast>(kv.second->attrs->dict); + CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + + CHECK(linked_params.empty()) << "Multiple linked-param functions"; + linked_params = + Downcast>(attrs_dict[::tvm::tir::attr::kLinkedParams]); + continue; + } auto f = Downcast(kv.second); - cg->AddFunction(f); + funcs.emplace_back(f); } + + cg->Init("TVMHexagonModule", tm.get(), ctx.get(), false, false, false); + cg->AddFunctionsOrdered(funcs.begin(), funcs.end()); + + if (!linked_params.empty()) { + cg->LinkParameters(linked_params); + } + // Uncomment to get the LLVM module right out of codegen, before optimizations. // std::cerr << "HexagonModule.0 {\n" << *cg->GetModulePtr() << "}\n"; std::unique_ptr module = cg->Finish(); diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 52c5b98a0025..a4f007aeebed 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -92,6 +93,25 @@ class CodeGenLLVM : public ExprFunctor, * \return the created module. */ virtual std::unique_ptr Finish(); + /*! + * \brief Add functions from the (unordered) range to the current module in a deterministic order. + * The range consists of objects convertible to PrimFunc. + * \param begin The beginning of the range. + * \param end The end of the range. + * \param pfunc Converter function from the range element type to PrimFunc. + */ + template + void AddFunctionsOrdered(IterType begin, IterType end, ConvType pfunc); + /*! + * \brief Add functions from the (unordered) range of elements of type PrimFunc to the current + * module in a deterministic order. + * \param begin The beginning of the range. + * \param end The end of the range. + */ + template + void AddFunctionsOrdered(IterType begin, IterType end) { + this->AddFunctionsOrdered(begin, end, [](auto f) { return f; }); + } /*! * \brief Add mod to be linked with the generated module * \param mod The module to be linked. 
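// ===== Editor's example (illustrative sketch; not part of the patch) =====
// Using the converter overload declared above to add every PrimFunc of an
// IRModule in a deterministic order; this mirrors the call sites updated in
// codegen_amdgpu.cc and codegen_nvptx.cc in this diff. codegen_llvm.h is an
// internal header under src/target/llvm/.
#include <tvm/ir/module.h>
#include <tvm/tir/function.h>
#include "codegen_llvm.h"

void AddAllFunctions(tvm::codegen::CodeGenLLVM* cg, const tvm::IRModule& mod) {
  cg->AddFunctionsOrdered(mod->functions.begin(), mod->functions.end(), [](auto& kv) {
    return tvm::runtime::Downcast<tvm::tir::PrimFunc>(kv.second);
  });
}
// ===== end editor's example =====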
@@ -377,6 +397,22 @@ inline int CodeGenLLVM::GetVectorNumElements(llvm::Value* vec) { #endif } +template +void CodeGenLLVM::AddFunctionsOrdered(IterType begin, IterType end, ConvType pfunc) { + std::vector funcs; + for (auto it = begin; it != end; ++it) { + funcs.push_back(pfunc(*it)); + } + std::sort(funcs.begin(), funcs.end(), [](PrimFunc func_a, PrimFunc func_b) { + std::string name_a = func_a->GetAttr(tvm::attr::kGlobalSymbol).value(); + std::string name_b = func_b->GetAttr(tvm::attr::kGlobalSymbol).value(); + return name_a < name_b; + }); + for (auto& f : funcs) { + AddFunction(f); + } +} + } // namespace codegen } // namespace tvm #endif // LLVM_VERSION diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index 15543eda423f..ebe6d6d67442 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -274,11 +274,11 @@ runtime::Module BuildNVPTX(IRModule mod, Target target) { cg->Init("TVMPTXModule", tm.get(), ctx.get(), false, false, false); - for (auto kv : mod->functions) { - ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; - auto f = Downcast(kv.second); - cg->AddFunction(f); - } + cg->AddFunctionsOrdered(mod->functions.begin(), mod->functions.end(), [](auto& kv) { + ICHECK(kv.second->template IsInstance()) + << "Can only lower IR Module with PrimFuncs"; + return Downcast(kv.second); + }); const auto* flibdevice_path = tvm::runtime::Registry::Get("tvm_callback_libdevice_path"); if (flibdevice_path != nullptr) { diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 8bdf6d1b0422..0e4bca4396f5 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -258,9 +258,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { // makes sense when we start to use multiple modules. 
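// ===== Editor's example (illustrative sketch; not part of the patch) =====
// AddFunctionsOrdered (implemented above) sorts by each function's
// kGlobalSymbol attribute, so every PrimFunc handed to it must carry one.
// Attaching a symbol:
#include <tvm/ir/function.h>
#include <tvm/tir/function.h>

tvm::tir::PrimFunc WithSymbol(tvm::tir::PrimFunc f, const std::string& name) {
  return tvm::WithAttr(std::move(f), tvm::attr::kGlobalSymbol,
                       tvm::runtime::String(name));
}
// ===== end editor's example =====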
cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); - for (const auto& f : funcs) { - cg->AddFunction(f); - } + cg->AddFunctionsOrdered(funcs.begin(), funcs.end()); if (entry_func.length() != 0) { cg->AddMainFunction(entry_func); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index dc849b8fa6b3..80ace929b881 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -59,6 +59,17 @@ void CodeGenCHost::AddFunction(const PrimFunc& f) { function_names_.push_back(global_symbol.value()); CodeGenC::AddFunction(f); + if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { + function_names_.push_back(runtime::symbol::tvm_module_main); + stream << "// CodegenC: NOTE: Auto-generated entry function\n"; + PrintFuncPrefix(); + stream << " " << tvm::runtime::symbol::tvm_module_main + << "(void* args, int* arg_type_ids, int num_args, void* out_ret_value, " + << "int* out_ret_tcode, void* resource_handle) {\n"; + stream << " return " << global_symbol.value() + << "(args, arg_type_ids, num_args, out_ret_value, out_ret_tcode, resource_handle);\n"; + stream << "}\n"; + } } void CodeGenCHost::DeclareParameters(Map params) { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 7728773b13d7..9b93b0726f3a 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -234,6 +234,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "}\n"; } + static int isNotAlnum(char c) { return !std::isalnum(c); } + void GenerateCInterfaceEntrypoint(const std::string& entrypoint_name, const std::string& run_func, const std::string& mod_name) { code_ << "#include <" << mod_name << ".h>\n"; @@ -252,7 +254,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << ") {"; code_ << "return " << run_func << "("; for (const auto& input : metadata_->inputs) { - code_ << "inputs->" << input << ","; + std::string sanitised_input = input; + std::replace_if(sanitised_input.begin(), sanitised_input.end(), isNotAlnum, '_'); + code_ << "inputs->" << sanitised_input << ","; } if (metadata_->num_outputs == 1) { code_ << "outputs->output"; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index ab8e6eaad157..d719386d204b 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -341,6 +341,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon) .add_attr_option("mcpu") .add_attr_option("mtriple") .add_attr_option("system-lib") + .add_attr_option("link-params", Bool(false)) .add_attr_option>("llvm-options") .set_default_keys({"hexagon"}); diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 8964c1013a53..5d71c5345fd0 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -778,11 +778,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->Print(op->outer); p->stream << ", inner="; p->Print(op->inner); + if (op->factor.defined()) { + p->stream << ", factor="; + p->Print(op->factor); + } else { + p->stream << ", nparts="; + p->Print(op->nparts); + } p->stream << ')'; }) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); - p->stream << "split("; + p->stream << "fuse("; p->stream << "outer="; p->Print(op->outer); p->stream << ", inner="; diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 335ff19dd775..8253fa3c0a36 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -294,6 +294,7 @@ 
inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); + ICHECK(n != nullptr); ICHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot load " << dtype << " from buffer of " << n->dtype; if (dtype == DataType::Bool()) { @@ -308,6 +309,7 @@ PrimExpr Buffer::vload(Array begin, DataType dtype) const { Stmt Buffer::vstore(Array begin, PrimExpr value) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); + ICHECK(n != nullptr); DataType dtype = value.dtype(); ICHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot store " << dtype << " to buffer of " << n->dtype; @@ -332,7 +334,9 @@ Buffer Buffer::MakeStrideView() const { if ((*this)->strides.size() != 0) return *this; if ((*this)->shape.size() == 0) return *this; std::vector temp; - auto n = make_object(*operator->()); + const BufferNode* self = operator->(); + ICHECK(self != nullptr); + auto n = make_object(*self); PrimExpr acc = make_const(n->DefaultIndexType(), 1); for (size_t i = n->shape.size(); i != 0; --i) { temp.push_back(acc); @@ -346,6 +350,7 @@ Buffer Buffer::MakeStrideView() const { Buffer Buffer::MakeSlice(Array begins, Array extents) const { const BufferNode* n = operator->(); + ICHECK(n != nullptr); arith::Analyzer ana; begins = SimplifyArray(&ana, begins); PrimExpr elem_offset = ana.Simplify(ElemOffset(n, begins)); @@ -374,6 +379,7 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lanes, PrimExpr offset) const { const BufferNode* self = operator->(); + ICHECK(self != nullptr); PrimExpr e_dtype; PrimExpr extent; if (self->shape.size() == 0) { diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index d4e4728abfe0..5a2f46c910b4 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -21,6 +21,7 @@ #include +#include #include #include @@ -69,11 +70,20 @@ StmtSRef GetSRefTreeRoot(const StmtSRef& sref); * \param self The schedule state * \param sref The sref whose scope is to be checked * \param require_stage_pipeline A boolean indicating whether to check stage pipeline - * \throw ScheduleError if the sref has been the root of the AST (so it has no scope root), or its - * scope root is not a stage pipeline + * \param require_subtree_compact_dataflow A boolean indicating whether to check + * subtree compact dataflow property. The scope root may have one or more subtrees rooted at + * its direct children, and this property requires all the blocks of the subtree + * that the specified sref is in to be complete block or reduction block. + * \throw ScheduleError if + * 1) the sref has been the root of the AST (so it has no scope root), or + * 2) require_stage_pipeline = true, but its scope root is not a stage pipeline + * 3) require_subtree_compact_dataflow = true, but the subtree that the sref is in doesn't satisfy + * the compact dataflow condition, i.e. 
a block in the subtree is neither complete block nor + * reduction block * \return The block sref to the scope root */ -StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref, bool require_stage_pipeline); +StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref, bool require_stage_pipeline, + bool require_subtree_compact_dataflow); /*! * \brief Checks whether the block is a complete block under the scope @@ -128,18 +138,36 @@ void CheckReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_root_sref); /*! - * \brief Check whether a subtree on SRef tree has compact data flow, and throw an exception if the - * subtree does not have compact data flow - * \details For a given StmtSRef, We say the subtree rooted from the StmtSRef has "compact data - * flow" property if: - * - the scope root of the input subtree root has stage-pipeline property, and - * - all its child blocks on SRef tree are complete blocks or reduction blocks. + * \brief Check if the block is a complete block or a reduction block under the scope * \param self The schedule state - * \param subtree_root_sref The root of the subtree to be checked in the SRef tree - * \throw ScheduleError If the subtree does not have compact data flow - * \sa IsCompleteBlock, IsReductionBlock + * \param block_sref The sref of the block to be checked + * \param scope_root_sref The scope root of the block + * \throw ScheduleError If the block is neither a complete block nor a reduction block + */ +void CheckCompleteOrReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref); + +/*! + * \brief Check if the block is an output block, i.e. the block writes to at least a buffer that is + * not allocated under the current scope + * \param self The schedule state + * \param block_sref The block to be checked + * \param scope_root_sref The scope root of the block + * \return A boolean flag indicating if the block is an output block + */ +bool IsOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref); + +/*! + * \brief Check if the block is not an output block, i.e. all the buffers the block writes to + * are allocated under the current scope + * \param self The schedule state + * \param block_sref The block to be checked + * \param scope_root_sref The scope root of the block + * \throw ScheduleError if the block is an output block */ -void CheckSRefSubtreeCompactDataFlow(const ScheduleState& self, const StmtSRef& subtree_root_sref); +void CheckNotOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref); /******** Binding ********/ /*! @@ -224,6 +252,7 @@ Array GetChildBlockRealizeOnSRefTree(const StmtSRef& parent_sref); */ BlockRealize CheckGetSingleChildBlockRealizeOnSRefTree(const ScheduleState& self, const StmtSRef& parent_sref); + /*! * \brief Get the BlockRealize of the input block * \param self The schedule state @@ -232,6 +261,55 @@ BlockRealize CheckGetSingleChildBlockRealizeOnSRefTree(const ScheduleState& self */ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sref); +/******** Producer-consumer relation ********/ + +/*! 
+ * \brief Get the producer blocks to the given block under the given scope
+ * \param block_sref The block whose producers are to be retrieved
+ * \param scope The block scope where the given block is in
+ * \return The producer blocks of the specified block
+ */
+Array<StmtSRef> GetProducers(const StmtSRef& block_sref, const BlockScope& scope);
+
+/*!
+ * \brief Get the consumer blocks to the given block under the given scope
+ * \param block_sref The block whose consumers are to be retrieved
+ * \param scope The block scope where the given block is in
+ * \return The consumer blocks of the specified block
+ */
+Array<StmtSRef> GetConsumers(const StmtSRef& block_sref, const BlockScope& scope);
+
+/*!
+ * \brief A solution to split an ordered list of subtrees into two parts,
+ * where producers are on the LHS and consumers are on the RHS.
+ * For example, subtrees[0, 3) are on the LHS, and subtrees[3, 6) are on the RHS.
+ */
+struct ProducerConsumerSplit {
+ /*! \brief Indicates that all producers fall into `subtrees[0, last_producer_position]` */
+ int last_producer_position;
+ /*! \brief Indicates that all consumers fall into `subtrees[first_consumer_position, ...)` */
+ int first_consumer_position;
+ /*! \brief The number of given producers visited in `subtrees` */
+ int n_producers_visited;
+ /*! \brief The number of given consumers visited in `subtrees` */
+ int n_consumers_visited;
+ /*!
+ * \brief Find a split among the given `subtrees`
+ * \param state The schedule state
+ * \param subtrees The ordered list of subtrees to be split
+ * \param producer_block_srefs The producers
+ * \param consumer_block_srefs The consumers
+ * \param block2realize If not null, the corresponding BlockRealize to each block in the scope
+ * will be saved in this map
+ * \return The valid split points are (last_producer_position, first_consumer_position]
+ * \throw ScheduleError if no valid split is found
+ */
+ static ProducerConsumerSplit Find(
+ const ScheduleState& state, const Array<Stmt>& subtrees,
+ const Array<StmtSRef>& producer_block_srefs, const Array<StmtSRef>& consumer_block_srefs,
+ std::unordered_map<const BlockNode*, const BlockRealizeNode*>* block2realize);
+};
+
 /******** Block-buffer relation ********/
 /*!
diff --git a/src/tir/schedule/analysis/analysis.cc index 3865781c5870..d14d64a4c787 100644
--- a/src/tir/schedule/analysis/analysis.cc
+++ b/src/tir/schedule/analysis/analysis.cc
@@ -47,22 +47,9 @@ const PrimFuncNode* GetRootPrimFunc(const IRModule& mod, const StmtNode* root_bl
 /******** Scope ********/
-/*!
- * \brief Gets the sref to the scope root block, exclusive
- * \param sref The block or loop sref to be retrieved
- * \return The sref to the scope root block.
NullOpt if `sref` is the root block of the IR - */ -Optional GetScopeRoot(const StmtSRef& sref) { - for (const StmtSRefNode* p = sref->parent; p != nullptr; p = p->parent) { - if (p->stmt->IsInstance()) { - return GetRef(p); - } - } - return NullOpt; -} - -StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref, - bool require_stage_pipeline) { +StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref, // + bool require_stage_pipeline, // + bool require_subtree_compact_dataflow) { class RootBlockError : public ScheduleError { public: explicit RootBlockError(IRModule mod) : mod_(mod) {} @@ -98,16 +85,67 @@ Definition of a scope that is a stage pipeline: Block block_; }; + class NotCompactDataFlowError : public ScheduleError { + public: + explicit NotCompactDataFlowError(IRModule mod, Stmt subtree_root, Block violate_block) + : mod_(std::move(mod)), + subtree_root_(std::move(subtree_root)), + violate_block_(std::move(violate_block)) { + ICHECK(subtree_root_->IsInstance() || subtree_root_->IsInstance()); + } + String FastErrorString() const final { + return "ScheduleError: The queried subtree root in SRef tree does not have compact dataflow, " + "because some of its child block on SRef tree is neither a complete block nor a " + "reduction block"; + } + String DetailRenderTemplate() const final { + return "The queried subtree root {0} in SRef tree does not have compact dataflow, because " + "its child block {1} on SRef tree is neither a complete block nor a reduction block"; + } + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {subtree_root_, violate_block_}; } + + IRModule mod_; + Stmt subtree_root_; + Block violate_block_; + }; + StmtSRef scope_root_sref{nullptr}; - if (Optional opt_scope_root_sref = GetScopeRoot(sref)) { - scope_root_sref = opt_scope_root_sref.value(); - } else { - throw RootBlockError(self->mod); + StmtSRef scope_root_subtree{nullptr}; + // Step 1. Find the scope root and the subtree that the given sref is in + { + const StmtSRefNode* p = sref->parent; + const StmtSRefNode* subtree = sref.get(); + for (; p != nullptr; subtree = p, p = p->parent) { + if (p->stmt->IsInstance()) { + scope_root_sref = GetRef(p); + scope_root_subtree = GetRef(subtree); + break; + } + } + if (p == nullptr) { + throw RootBlockError(self->mod); + } + } + // Step 2. Handle `require_stage_pipeline` + if (require_stage_pipeline) { + bool stage_pipeline = self->GetBlockInfo(scope_root_sref).scope->stage_pipeline; + if (stage_pipeline == false) { + const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root_sref); + throw NotStagePipelineError(self->mod, GetRef(block)); + } } - bool stage_pipeline = self->GetBlockInfo(scope_root_sref).scope->stage_pipeline; - if (require_stage_pipeline && stage_pipeline == false) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root_sref); - throw NotStagePipelineError(self->mod, GetRef(block)); + // Step 3. 
Handle `require_subtree_compact_dataflow` + if (require_subtree_compact_dataflow) { + Array child_block_srefs = GetChildBlockSRefOnSRefTree(self, scope_root_sref); + for (const StmtSRef& block_sref : child_block_srefs) { + if (!IsCompleteBlock(self, block_sref, scope_root_sref) && + !IsReductionBlock(self, block_sref, scope_root_sref)) { + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + throw NotCompactDataFlowError(self->mod, GetRef(scope_root_subtree->stmt), + GetRef(block)); + } + } } return scope_root_sref; } @@ -174,6 +212,18 @@ int CheckCompleteBlockErrorCode(const ScheduleState& self, const StmtSRef& block return 0; } +static const char* kCompleteBlockDefinition = R"(Definition of a complete block: +1) All block vars are data parallel +2) Dominant: the block is the only writer of its output, dominating the reader of its output buffers +3) No overlap between the buffers the block reads and writes)"; + +static const char* kReductionBlockDefinition = R"(Definition of a reduction block: +1) The block has the `init` statement +2) All the block bindings are quasi-affine expressions +3) All block vars are either data parallel block vars or reduction block vars +4) Dominant: the block is the only writer of its output, dominating the reader of its output buffers +5) The reduction block vars are not used to index the output buffers)"; + bool IsCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_root_sref) { return CheckCompleteBlockErrorCode(self, block_sref, scope_root_sref) == 0; @@ -188,12 +238,8 @@ void CheckCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref, String FastErrorString() const final { return "ScheduleError: Incomplete block"; } String DetailRenderTemplate() const final { std::ostringstream os; - os << "The block {0} is not a complete block - it violates condition #" << violated_cond_ - << ".\n" - << R"(Definition of a complete block: -1) All block vars are data parallel -2) Dominant: the block is the only writer of its output, dominating the reader of its output buffers -3) No overlap between the buffers the block reads and writes)"; + os << "The block {0} is not a complete block - it violates condition #" << violated_cond_; + os << ".\n" << kCompleteBlockDefinition; return os.str(); } IRModule mod() const final { return mod_; } @@ -291,14 +337,8 @@ void CheckReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, String FastErrorString() const final { return "ScheduleError: Not a reduction block"; } String DetailRenderTemplate() const final { std::ostringstream os; - os << "The block {0} is not a reduction block - it violates condition #" << violated_cond_ - << ".\n" - << R"(Definition of a reduction block: -1) The block has the `init` statement -2) All the block bindings are quasi-affine expressions -3) All block vars are either data parallel block vars or reduction block vars -4) Dominant: the block is the only writer of its output, dominating the reader of its output buffers -5) The reduction block vars are not used to index the output buffers)"; + os << "The block {0} is not a reduction block - it violates condition #" << violated_cond_; + os << ".\n" << kReductionBlockDefinition; return os.str(); } IRModule mod() const final { return mod_; } @@ -315,41 +355,89 @@ void CheckReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, } } -void CheckSRefSubtreeCompactDataFlow(const ScheduleState& self, const StmtSRef& subtree_root_sref) { - class NotCompactDataFlowError : 
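The shared definition strings hoisted above (kCompleteBlockDefinition, kReductionBlockDefinition) let every error path stream identical wording after its violated-condition number. A minimal standalone sketch of that rendering pattern (toy constant and a hypothetical Render helper, not TVM code):

#include <iostream>
#include <sstream>
#include <string>

// Shared definition text, streamed by every error message that needs it.
static const char* kToyDefinition = R"(Definition of a toy block:
1) condition one
2) condition two)";

std::string Render(int violated_cond) {
  std::ostringstream os;
  os << "The block is not a toy block - it violates condition #" << violated_cond;
  os << ".\n" << kToyDefinition;
  return os.str();
}

int main() { std::cout << Render(2) << "\n"; }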
public ScheduleError { +void CheckCompleteOrReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref) { + class NotCompleteOrReductionBlockError : public ScheduleError { public: - explicit NotCompactDataFlowError(IRModule mod, Stmt subtree_root, Block violate_block) - : mod_(std::move(mod)), - subtree_root_(std::move(subtree_root)), - violate_block_(std::move(violate_block)) { - ICHECK(subtree_root_->IsInstance() || subtree_root_->IsInstance()); - } + explicit NotCompleteOrReductionBlockError(IRModule mod, Block block, + int complete_block_error_code, + int reduction_block_error_code) + : mod_(mod), + block_(block), + complete_block_error_code_(complete_block_error_code), + reduction_block_error_code_(reduction_block_error_code) {} + String FastErrorString() const final { - return "ScheduleError: The queried subtree root in SRef tree does not have compact data " - "flow, because some of its child block on SRef tree is neither a complete block nor a " - "reduction block"; + return "ScheduleError: Not a complete or reduction block"; } String DetailRenderTemplate() const final { - return "The queried subtree root {0} in SRef tree does not have compact data flow, because " - "its child block {1} on SRef tree is neither a complete block nor a reduction block"; + std::ostringstream os; + os << "The block {0} is not a complete block - it violates condition #" + << complete_block_error_code_; + os << ".\n" << kCompleteBlockDefinition; + os << "\nThe block is not a reduction block either - it violates condition #" + << reduction_block_error_code_; + os << ".\n" << kReductionBlockDefinition; + return os.str(); } IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {subtree_root_, violate_block_}; } + Array LocationsOfInterest() const final { return {block_}; } IRModule mod_; - Stmt subtree_root_; - Block violate_block_; + Block block_; + int complete_block_error_code_; + int reduction_block_error_code_; }; - StmtSRef scope_root = GetScopeRoot(self, subtree_root_sref, /*require_stage_pipeline=*/true); - Array child_blocks = GetChildBlockSRefOnSRefTree(self, scope_root); - for (const StmtSRef& block : child_blocks) { - if (!IsCompleteBlock(self, block, scope_root) && !IsReductionBlock(self, block, scope_root)) { - const BlockNode* violate_block = TVM_SREF_TO_BLOCK(violate_block, block); - throw NotCompactDataFlowError(self->mod, GetRef(subtree_root_sref->stmt), - GetRef(violate_block)); + int complete_block_error_code = CheckCompleteBlockErrorCode(self, block_sref, scope_root_sref); + if (complete_block_error_code == 0) { + return; + } + int reduction_block_error_code = CheckReductionBlockErrorCode(self, block_sref, scope_root_sref); + if (reduction_block_error_code == 0) { + return; + } + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + throw NotCompleteOrReductionBlockError(self->mod, GetRef(block), complete_block_error_code, + reduction_block_error_code); +} + +bool IsOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref) { + const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + std::unordered_set scope_allocated; + scope_allocated.reserve(scope_root->alloc_buffers.size()); + for (const Buffer& buffer : scope_root->alloc_buffers) { + scope_allocated.insert(buffer.get()); + } + for (const BufferRegion& buffer_region : block->writes) { + if 
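CheckCompleteOrReductionBlock above composes the two per-kind checks through their integer error codes, where 0 means "no condition violated" and the combined check throws only when both checks fail. A self-contained toy of the same control flow (stand-in predicates, not TVM code):

#include <iostream>
#include <stdexcept>
#include <string>

// Stand-in predicates: return 0 for "no condition violated", otherwise the
// index of the first violated condition, mirroring CheckCompleteBlockErrorCode.
int CheckCompleteErrorCode(int block_id) { return block_id == 0 ? 0 : 1; }
int CheckReductionErrorCode(int block_id) { return block_id == 1 ? 0 : 2; }

void CheckCompleteOrReduction(int block_id) {
  int complete_code = CheckCompleteErrorCode(block_id);
  if (complete_code == 0) return;  // complete block: nothing to report
  int reduction_code = CheckReductionErrorCode(block_id);
  if (reduction_code == 0) return;  // reduction block: nothing to report
  throw std::runtime_error("neither complete (condition #" +
                           std::to_string(complete_code) +
                           ") nor reduction (condition #" +
                           std::to_string(reduction_code) + ")");
}

int main() {
  CheckCompleteOrReduction(0);  // passes as a complete block
  CheckCompleteOrReduction(1);  // passes as a reduction block
  try {
    CheckCompleteOrReduction(2);  // fails both checks
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << "\n";
  }
}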
(!scope_allocated.count(buffer_region->buffer.get())) { + return true; } } + return false; +} + +void CheckNotOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& scope_root_sref) { + class OutputBlockError : public ScheduleError { + public: + explicit OutputBlockError(IRModule mod, Block block) : mod_(mod), block_(block) {} + String FastErrorString() const final { + return "ScheduleError: Cannot operate on an output block"; + } + String DetailRenderTemplate() const final { return "The block {0} is an output block"; } + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + + IRModule mod_; + Block block_; + }; + if (IsOutputBlock(self, block_sref, scope_root_sref)) { + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + throw OutputBlockError(self->mod, GetRef(block)); + } } /******** Binding ********/ @@ -586,6 +674,125 @@ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sr } } +/******** Producer-consumer relation ********/ + +Array GetProducers(const StmtSRef& block_sref, const BlockScope& scope) { + Array deps = scope->GetDepsByDst(block_sref); + Array result; + result.reserve(deps.size()); + for (const Dependency& dep : deps) { + result.push_back(dep->src); + } + return result; +} + +Array GetConsumers(const StmtSRef& block_sref, const BlockScope& scope) { + Array deps = scope->GetDepsBySrc(block_sref); + Array result; + result.reserve(deps.size()); + for (const Dependency& dep : deps) { + result.push_back(dep->dst); + } + return result; +} + +ProducerConsumerSplit ProducerConsumerSplit::Find( + const ScheduleState& self, const Array& subtrees, + const Array& producer_block_srefs, const Array& consumer_block_srefs, + std::unordered_map* block2realize) { + class InsertionPointNotFoundError : public ScheduleError { + public: + explicit InsertionPointNotFoundError(IRModule mod, int last_producer_position, + int first_consumer_position) + : mod_(mod), + last_producer_position_(last_producer_position), + first_consumer_position_(first_consumer_position) {} + + String FastErrorString() const final { + return "ScheduleError: Cannot find the insertion point that satisfies the producer-consumer " + "constraint"; + } + + String DetailRenderTemplate() const final { + return "Cannot find the insertion point that satisfies the producer-consumer constraint. 
In " + "0-based indexing, the last producer appears in subtree " + + std::to_string(last_producer_position_) + + ", and the first consumer appears in subtree " + + std::to_string(first_consumer_position_); + } + + IRModule mod() const final { return mod_; } + + Array LocationsOfInterest() const final { return {}; } + + private: + IRModule mod_; + int last_producer_position_; + int first_consumer_position_; + }; + + class Finder : public StmtVisitor { + public: + void VisitStmt_(const BlockRealizeNode* realize) final { + const BlockNode* block = realize->block.get(); + if (block2realize_) { + block2realize_->emplace(block, realize); + } + if (producer_blocks_.count(block)) { + ++this->n_producers_visited_; + } + if (consumer_blocks_.count(block)) { + ++this->n_consumers_visited_; + } + } + + std::unordered_map* block2realize_; + std::unordered_set producer_blocks_; + std::unordered_set consumer_blocks_; + int n_producers_visited_ = 0; + int n_consumers_visited_ = 0; + }; + + Finder finder; + finder.block2realize_ = block2realize; + // Set up the lookup table for producers + finder.producer_blocks_.reserve(producer_block_srefs.size()); + for (const StmtSRef& block_sref : producer_block_srefs) { + finder.producer_blocks_.insert(block_sref->stmt); + } + // Set up the lookup table for consumers + finder.consumer_blocks_.reserve(consumer_block_srefs.size()); + for (const StmtSRef& block_sref : consumer_block_srefs) { + finder.consumer_blocks_.insert(block_sref->stmt); + } + // Visit the subtrees + int n = subtrees.size(); + int last_producer_position = -1; + int first_consumer_position = n; + for (int i = 0; i < n; ++i) { + int n_producers_visited_before = finder.n_producers_visited_; + int n_consumers_visited_before = finder.n_consumers_visited_; + finder(subtrees[i]); + // Check if the subtree contains at least a producer + if (finder.n_producers_visited_ != n_producers_visited_before) { + last_producer_position = i; + } + // Check if the subtree contains at least a consumer + if (finder.n_consumers_visited_ != n_consumers_visited_before) { + if (first_consumer_position == n) { + first_consumer_position = i; + } + } + } + if (last_producer_position >= first_consumer_position) { + throw InsertionPointNotFoundError(self->mod, last_producer_position, first_consumer_position); + } + return ProducerConsumerSplit{last_producer_position, // + first_consumer_position, // + finder.n_producers_visited_, // + finder.n_consumers_visited_}; +} + /******** Block-buffer relation ********/ Buffer GetNthAccessBuffer(const ScheduleState& self, const Block& block, int n, bool is_write) { @@ -957,11 +1164,13 @@ bool FromIdentityCombiner(const PrimExpr& identity, const BufferStore& combiner, } /******** SRef Tree Related ********/ + StmtSRef GetSRefTreeRoot(const StmtSRef& sref) { const StmtSRefNode* p = sref.get(); for (; p->parent != nullptr; p = p->parent) { } return GetRef(p); } + } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 86223e11c196..07af73ebabb6 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -439,6 +439,44 @@ BlockRV ConcreteScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buff /******** Schedule: Compute location ********/ +void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) { + static StmtSRef inline_mark = StmtSRef::InlineMark(); + static StmtSRef root_mark = StmtSRef::RootMark(); + StmtSRef 
loop_sref = this->GetSRef(loop_rv); + if (loop_sref.same_as(root_mark)) { + // do nothing + } else if (loop_sref.same_as(inline_mark)) { + TVM_TIR_SCHEDULE_BEGIN(); + tir::ComputeInline(state_, this->GetSRef(block_rv)); + TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_); + } else { + TVM_TIR_SCHEDULE_BEGIN(); + tir::ComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops); + TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_); + } + this->state_->DebugVerify(); +} + +void ConcreteScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) { + static StmtSRef inline_mark = StmtSRef::InlineMark(); + static StmtSRef root_mark = StmtSRef::RootMark(); + StmtSRef loop_sref = this->GetSRef(loop_rv); + if (loop_sref.same_as(root_mark)) { + // do nothing + } else if (loop_sref.same_as(inline_mark)) { + TVM_TIR_SCHEDULE_BEGIN(); + tir::ReverseComputeInline(state_, this->GetSRef(block_rv)); + TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_); + } else { + TVM_TIR_SCHEDULE_BEGIN(); + tir::ReverseComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops); + TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_); + } + this->state_->DebugVerify(); +} + void ConcreteScheduleNode::ComputeInline(const BlockRV& block_rv) { TVM_TIR_SCHEDULE_BEGIN(); tir::ComputeInline(state_, this->GetSRef(block_rv)); diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index e756f9da41b2..c9a9402832f2 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -108,6 +108,9 @@ class ConcreteScheduleNode : public ScheduleNode { BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope) override; /******** Schedule: Compute location ********/ + void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) override; + void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) override; void ComputeInline(const BlockRV& block) override; void ReverseComputeInline(const BlockRV& block) override; /******** Schedule: Reduction ********/ diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 412611adf76d..05eefaca8a11 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -160,6 +160,44 @@ TVM_DLL StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int r TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index, const String& storage_scope); /******** Schedule: Compute location ********/ +/*! + * \brief Move a producer block under the specific loop, and regenerate the + * loops induced by the block so that the buffer region produced by the producer block could + * cover those regions consumed by its consumer blocks under the given loop. It requires: + * 1) `block` and `loop` are under the same scope, `loop` is not the ancestor of `block` + * 2) The scope block has stage-pipeline property + * 3) The subtree of the scope block, where the given block is in, satisfies the compact dataflow + * condition. i.e. all the blocks in the scope block's subtree must be either complete block or + * reduction block + * 4) The block is not an output block with regard to the scope block, i.e. 
the buffers written by
+ * the block are allocated under the scope block
+ * 5) All the consumers of the block are under the given loop
+ *
+ * \param self The schedule state
+ * \param block_sref The block to be moved
+ * \param loop_sref The loop under which the block is to be moved
+ * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+ */
+TVM_DLL void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref,
+                       bool preserve_unit_loops);
+/*!
+ * \brief Move a consumer block under the specific loop, and regenerate the
+ * loops induced by the block so that the buffer region consumed by the consumer block could
+ * cover those regions produced by its producer blocks under the given loop. It requires:
+ * 1) `block` and `loop` are under the same scope, `loop` is not the ancestor of `block`
+ * 2) The scope block has stage-pipeline property
+ * 3) The subtree of the scope block, where the given block is in, satisfies the compact dataflow
+ * condition. i.e. all the blocks in the scope block's subtree must be either complete block or
+ * reduction block
+ * 4) All the producers of the block are under the given loop
+ *
+ * \param self The schedule state
+ * \param block_sref The block to be moved
+ * \param loop_sref The loop under which the block is to be moved
+ * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
+ */
+TVM_DLL void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref,
+                              const StmtSRef& loop_sref, bool preserve_unit_loops);
 /*!
  * \brief Inline a block into its consumer(s). It requires:
  * 1) The block is a complete non-root block, which only produces one buffer
@@ -199,6 +237,10 @@ TVM_DLL void ReverseComputeInline(ScheduleState self, const StmtSRef& block_sref
  */
 TVM_DLL StmtSRef RFactor(ScheduleState self, const StmtSRef& loop_sref, int factor_axis);
 /******** Schedule: Block annotation ********/
+/*! \brief The quad used by StorageAlign for (buffer_idx, axis, factor, offset) */
+using StorageAlignTuple = Array<Integer>;
+/*! \brief A list of StorageAlignTuple, used by StorageAlign */
+using StorageAlignAnnotation = Array<StorageAlignTuple>;
 /*!
  * \brief Set alignment requirement for specific dimension such that
  *        stride[axis] == k * factor + offset for some k.
This is useful to set memory layout for @@ -214,10 +256,6 @@ TVM_DLL StmtSRef RFactor(ScheduleState self, const StmtSRef& loop_sref, int fact TVM_DLL void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_index, int axis, int factor, int offset); -/******** Annotation types for StorageAlign ********/ -using StorageAlignTuple = Array; // (buffer_idx, axis, factor, offset) -using StorageAlignAnnotation = Array; // unordered array of StorageAlignTuple - /******** Schedule: Blockize & Tensorize ********/ /******** Schedule: Annotation ********/ /******** Schedule: Misc ********/ diff --git a/src/tir/schedule/primitive/block_annotate.cc b/src/tir/schedule/primitive/block_annotate.cc index 06f7ac3c1bc2..a96c8ca09f32 100644 --- a/src/tir/schedule/primitive/block_annotate.cc +++ b/src/tir/schedule/primitive/block_annotate.cc @@ -270,7 +270,7 @@ void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_ind self->Replace(block_sref, new_block, {{GetRef(block_ptr), new_block}}); } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct StorageAlignTraits : public UnpackedInstTraits { static constexpr const char* kName = "StorageAlign"; diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index df54c9652ece..8628cc3c0791 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -146,7 +146,7 @@ Block MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info, /*annotations=*/{}); // Create the block realize node Stmt body = BlockRealize(/*values=*/iter_values, - /*predicate=*/Bool(true), + /*predicate=*/const_true(), /*block=*/block); // Create surrounding loops for (size_t i = loop_vars.size(); i >= 1; --i) { @@ -160,6 +160,21 @@ Block MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info, return block; } +/*! + * \brief Recalculate the `affine_binding` flag of a specifc block + * \param block_sref The sref to the specific block + */ +bool CalculateAffineFlag(const ScheduleState& self, const StmtSRef& block_sref) { + if (block_sref->parent == nullptr) { + return true; + } + arith::Analyzer analyzer; + StmtSRef parent_sref = GetRef(block_sref->parent); + return IsAffineBinding(/*realize=*/GetBlockRealize(self, block_sref), + /*loop_var_ranges=*/LoopDomainOfSRefTreePath(parent_sref), + /*analyzer=*/&analyzer); +} + /*! * \brief Insert the cache_read/cache_write stage into the specific position * \param stmt A sequence of statements or a single statement that the new stage is inserted in @@ -613,7 +628,8 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); Buffer read_buffer = GetNthAccessBuffer(self, GetRef(block), read_buffer_index, /*is_write=*/false); - StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/false); const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref); // Step 2. Creat CacheStageInfo @@ -657,8 +673,8 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff // Step 5. Replacing and updating flags. 
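The StorageAlignTuple alias moved into primitive.h earlier in this diff packs its fields positionally as (buffer_idx, axis, factor, offset). A plain-C++ illustration of that layout (std::array stand-ins, not TVM's Array types):

#include <array>
#include <vector>

using Quad = std::array<int, 4>;  // (buffer_idx, axis, factor, offset)

// One quad asking that stride[axis 2] == k * 32 + 8 for the block's buffer #0;
// the full annotation is simply a list of such quads.
std::vector<Quad> storage_align_annotation = {{0, 2, 32, 8}};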
self->Replace(scope_sref, new_scope, info.block_reuse); StmtSRef result_block_sref = self->stmt2ref.at(cache_read_stage.get()); - self->UpdateAffineFlag(result_block_sref); BlockInfo& block_info = self->block_info[result_block_sref]; + block_info.affine_binding = CalculateAffineFlag(self, result_block_sref); block_info.region_cover = true; block_info.scope->stage_pipeline = true; return result_block_sref; @@ -680,7 +696,8 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); Buffer write_buffer = GetNthAccessBuffer(self, GetRef(block), write_buffer_index, /*is_write=*/true); - StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/false); // Step 2. Creating CacheStageInfo CacheStageInfo info; @@ -710,8 +727,8 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu // Step 6. Replacing and updating flags. self->Replace(scope_sref, new_scope, info.block_reuse); StmtSRef result_block_sref = self->stmt2ref.at(cache_write_stage.get()); - self->UpdateAffineFlag(result_block_sref); BlockInfo& block_info = self->block_info[result_block_sref]; + block_info.affine_binding = CalculateAffineFlag(self, result_block_sref); block_info.region_cover = true; block_info.scope->stage_pipeline = true; return result_block_sref; diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc new file mode 100644 index 000000000000..0dae50abc05e --- /dev/null +++ b/src/tir/schedule/primitive/compute_at.cc @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace tir { + +using support::NDIntSet; + +/******** Error Classes ********/ + +/*! + * \brief An error raised when not all required blocks are under the given loop. + * \tparam is_consumer Indicates if all the required blocks are consumers or producers + */ +template +class NotAllRequiredBlocksAreVisitedError : public ScheduleError { + public: + explicit NotAllRequiredBlocksAreVisitedError(IRModule mod, int num_not_visited, + const Array& required) + : mod_(mod), num_not_visited_(num_not_visited) { + required_.reserve(required.size()); + for (const StmtSRef& block_sref : required) { + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + required_.push_back(GetRef(block)); + } + } + + String FastErrorString() const final { + return "ScheduleError: Not all required blocks are under the loop scope"; + } + + String DetailRenderTemplate() const final { + String relation = is_consumer ? 
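NotAllRequiredBlocksAreVisitedError, being defined here, is parameterized on a compile-time bool that only selects the wording of the message, so one class body serves both the producer and consumer checks. A minimal sketch of that pattern (toy function, not TVM code):

#include <iostream>
#include <string>

template <bool is_consumer>
std::string Describe(int num_not_visited) {
  std::string relation = is_consumer ? "consumer(s)" : "producer(s)";
  return std::to_string(num_not_visited) + " " + relation +
         " are not under the target loop";
}

int main() {
  std::cout << Describe<true>(2) << "\n";   // missing consumers
  std::cout << Describe<false>(1) << "\n";  // missing producers
}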
"consumer(s)" : "producer(s)"; + std::ostringstream os; + os << "The primitive requires all the " << relation + << " of the given block to be present under the target loop. However, there are " + << num_not_visited_ << " " << relation << " not satisfying the constraint. List of the " + << relation << ":"; + for (int i = 0, n = required_.size(); i < n; ++i) { + os << "{" << i << "}"; + } + return os.str(); + } + + IRModule mod() const final { return mod_; } + + Array LocationsOfInterest() const final { + return {required_.begin(), required_.end()}; + } + + private: + IRModule mod_; + int num_not_visited_; + Array required_; +}; + +/*! + * \brief An error raised when the given block is not in the same block scope as the given loop, + * or the given loop is the ancestor of the given block. + */ +class NotInSameScopeError : public ScheduleError { + public: + static void CheckAndBindLoopDomain(const ScheduleState& self, const StmtSRef& block_sref, + const StmtSRef& loop_sref, const StmtSRef& scope_root_sref, + arith::Analyzer* analyzer) { + for (const StmtSRefNode* p = loop_sref.get();; p = p->parent) { + if (const ForNode* loop = p->StmtAs()) { + analyzer->Bind(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent)); + } else if (p != scope_root_sref.get()) { + throw NotInSameScopeError(self->mod, block_sref, loop_sref); + } else { + break; + } + } + for (const StmtSRefNode* p = block_sref->parent; p != scope_root_sref.get(); p = p->parent) { + if (p == loop_sref.get()) { + throw NotInSameScopeError(self->mod, block_sref, loop_sref); + } + } + } + + String FastErrorString() const final { + return "ScheduleError: Expected the block and loop to be under the same block scope, and loop " + "not to be the ancestor of block"; + } + String DetailRenderTemplate() const final { + return "ScheduleError: Expected the block {0} and loop {1} to be under the same block scope, " + "and loop not to be the ancestor of block"; + } + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_, loop_}; } + + private: + explicit NotInSameScopeError(IRModule mod, const StmtSRef& block_sref, const StmtSRef& loop_sref) + : mod_(mod), + block_(GetRef(block_sref->StmtAs())), + loop_(GetRef(loop_sref->StmtAs())) {} + + IRModule mod_; + Block block_; + For loop_; +}; + +/******** Helper Functions/Classes ********/ + +/*! + * \brief Find a point where the block can be inserted under the loop + * \tparam require_all_producers_visited Requires all producer blocks to be present under the loop + * \tparam require_all_consumers_visited Requires all consumer blocks to be present under the loop + * \param self The schedule state + * \param subtrees The subtrees under the loop, among which the insertion points are sought + * \param producer_srefs The producer blocks + * \param consumer_srefs The consumer blocks + * \param block2realize A cache that maps a block to its realize + * \return The last position the new block can be inserted onto, and the + * producer-consumer-relationship is still satisfied. + * \throws ScheduleError if there is no such insertion point found + */ +template +int FindInsertionPoint( + const ScheduleState& self, const Array& subtrees, const Array& producer_srefs, + const Array& consumer_srefs, + std::unordered_map* block2realize) { + ProducerConsumerSplit split = + ProducerConsumerSplit::Find(self, subtrees, producer_srefs, consumer_srefs, block2realize); + // Step 1. 
Check if all the producers are visited in the subtrees, if required to + if (require_all_producers_visited) { + int num_producers = producer_srefs.size(); + if (split.n_producers_visited < num_producers) { + throw NotAllRequiredBlocksAreVisitedError( + self->mod, num_producers - split.n_producers_visited, producer_srefs); + } + } + // Step 2. Check if all the consumers are visited in the subtrees, if required to + if (require_all_consumers_visited) { + int num_consumers = consumer_srefs.size(); + if (split.n_consumers_visited < num_consumers) { + throw NotAllRequiredBlocksAreVisitedError( + self->mod, num_consumers - split.n_consumers_visited, consumer_srefs); + } + } + // Step 3. Check if there is at least one index of the position can be inserted into + // The valid indices are: (last_producer_position, first_consumer_position] + ICHECK(split.last_producer_position < split.first_consumer_position); + // Step 4. Return the last valid insertion point + return split.first_consumer_position; +} + +/*! + * \brief A helper to reconstruct the block scope where the given block is moved under the given + * loop, and the given block's induced loop nest is regenerated to satisfy the required region. + */ +class ScopeReconstructor : private StmtMutator { + public: + explicit ScopeReconstructor(Block scope_root, Block block, For loop) + : scope_root_(scope_root), block_(block), loop_(loop) {} + + using StmtMutator::operator(); + + /*! + * \brief Create the loop nest on top of the block, induced by the given block var's domain + * \param insert_position The position among the subtrees where the block and its induced loop + * nest is inserted + * \param iter_doms The domain of each block var + * \param preserve_unit_loops Whether to generate unit loops where the loop extent is 1 + */ + void MakeNewLoop(int insert_position, std::vector iter_doms, bool preserve_unit_loops) { + int n_iters = iter_doms.size(); + Array loop_vars; + Array loop_extents; + Array iter_values; + loop_vars.reserve(n_iters); + loop_extents.reserve(n_iters); + iter_values.reserve(n_iters); + for (int i = 0; i < n_iters; ++i) { + const Range& iter_dom = iter_doms[i]; + if (preserve_unit_loops || !is_one(iter_dom->extent)) { + Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(32)); + loop_vars.push_back(var); + loop_extents.push_back(iter_dom->extent); + iter_values.push_back(iter_dom->min + var); + } else { + iter_values.push_back(iter_dom->min); + } + } + this->new_block_realize_ = + BlockRealize(std::move(iter_values), const_true(), std::move(block_)); + Stmt new_subtree = this->new_block_realize_; + for (int i = static_cast(loop_vars.size()) - 1; i >= 0; --i) { + const Var& loop_var = loop_vars[i]; + const PrimExpr& loop_extent = loop_extents[i]; + new_subtree = For(/*loop_var=*/loop_var, + /*min=*/Integer(0), + /*extent=*/loop_extent, + /*ForKind=*/ForKind::kSerial, + /*body=*/std::move(new_subtree)); + } + Array subtrees = AsArray(loop_->body); + subtrees.insert(subtrees.begin() + insert_position, std::move(new_subtree)); + ObjectPtr new_loop = make_object(*loop_.get()); + new_loop->body = SeqStmt(std::move(subtrees)); + this->new_loop_ = For(std::move(new_loop)); + } + + private: + Stmt VisitStmt_(const BlockNode* block) final { + if (block != scope_root_.get()) { + return GetRef(block); + } + if (block == rm_src_stmt_.get()) { + block = TVM_TYPE_AS(block, rm_tgt_stmt_, BlockNode); + } + return StmtMutator::VisitStmt_(block); + } + + Stmt VisitStmt_(const ForNode* loop) final { + if (loop == rm_src_stmt_.get()) { 
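MakeNewLoop above regenerates one loop per non-unit block-var domain: when a loop is emitted, the block var is bound to min + var; when a unit-extent loop is elided (preserve_unit_loops == false), the var is bound directly to min. A self-contained sketch of that rule (toy string bindings, not TVM IR):

#include <iostream>
#include <string>
#include <vector>

struct Dom { int min; int extent; };

std::vector<std::string> MakeBindings(const std::vector<Dom>& doms,
                                      bool preserve_unit_loops) {
  std::vector<std::string> bindings;
  int n_loops = 0;
  for (const Dom& d : doms) {
    if (preserve_unit_loops || d.extent != 1) {
      std::string var = "ax" + std::to_string(n_loops++);  // a fresh loop var
      bindings.push_back(std::to_string(d.min) + " + " + var);
    } else {
      bindings.push_back(std::to_string(d.min));  // loop elided, bind the min
    }
  }
  return bindings;
}

int main() {
  for (const std::string& b : MakeBindings({{0, 4}, {7, 1}}, /*preserve_unit_loops=*/false))
    std::cout << b << "\n";  // prints "0 + ax0" then "7"
}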
+ loop = TVM_TYPE_AS(loop, rm_tgt_stmt_, ForNode); + } + if (loop == loop_.get()) { + return new_loop_; + } + return StmtMutator::VisitStmt_(loop); + } + + public: + /*! \brief The root block of the block scope */ + Block scope_root_; + /*! \brief The given block to be moved */ + Block block_; + /*! \brief The given loop the block and its loop nest to be put under */ + For loop_; + /*! \brief The new loop to replace the original loop */ + For new_loop_{nullptr}; + /*! \brief The new block realize to the moved block */ + BlockRealize new_block_realize_{nullptr}; + /*! \brief The plan to remove the given block by replacing this loop/block in the AST */ + Stmt rm_src_stmt_{nullptr}; + /*! \brief The plan to remove the given block by replacing to this loop/block in the AST */ + Stmt rm_tgt_stmt_{nullptr}; +}; + +/*! + * \brief Calculate a list of accessed buffer regions under a path of loops + * \tparam relax_storage_scope Whether to relax beyond the path according to the storage and + * execution scope + * \param binding The block binding, used to unbind the buffer regions + * \param buffer_regions The buffer regions to be calculated + * \param relax_path_low_inclusive The lowest point in the loop path, inclusive + * \param relax_path_high_exclusive The highest point in the loop path, exclusive + * \param relaxed Where the calculation result is stored + */ +template +void RelaxBufferRegions(const Map& binding, + const Array& buffer_regions, + const StmtSRef& relax_path_low_inclusive, + const StmtSRef& relax_path_high_exclusive, + std::unordered_map>* relaxed) { + runtime::StorageScope global_scope{runtime::StorageRank::kGlobal, ""}; + // We cache the variable domains + runtime::StorageRank previous_rank = runtime::StorageRank::kGlobal; + Optional> var_dom = NullOpt; + // Enumerate every buffer region + for (const BufferRegion& buffer_region : buffer_regions) { + const Buffer& buffer = buffer_region->buffer; + const Array& region = buffer_region->region; + // Skip the buffer regions we are not interested in + auto it = relaxed->find(buffer.get()); + if (it == relaxed->end()) { + continue; + } + std::vector& relaxed_regions = it->second; + // Check and update the cached `var_dom` + runtime::StorageScope scope = + relax_storage_scope ? runtime::StorageScope::Create(buffer.scope()) : global_scope; + runtime::StorageRank rank = scope.rank; + if (rank != previous_rank || !var_dom.defined()) { + previous_rank = rank; + var_dom = AsIntSet(LoopDomainOfSRefTreePath( + /*low_inclusive=*/relax_path_low_inclusive, + /*high_exclusive=*/relax_path_high_exclusive, + /*extra_relax_scope=*/scope)); + } + // Relax the region + Array relaxed_region = + arith::EvalSet(Substitute(region, binding), var_dom.value()); + relaxed_regions.push_back({relaxed_region.begin(), relaxed_region.end()}); + } +} + +/*! 
+ * \brief Calculate the iteration domain of a provided integer set to fully cover the required + * domain + * \param provided The provided integer set to cover the required domain + * \param required The required domain to be covered + * \param iter_doms The result iteration domains to be updated + * \param analyzer The arithmetic analyzer + */ +void UpdateBlockVarDomain(const arith::IntSet& provided, const arith::IntSet& required, + std::unordered_map>* iter_doms, + arith::Analyzer* analyzer) { + PrimExpr provided_min = analyzer->Simplify(provided.min()); + PrimExpr provided_extent = analyzer->Simplify(provided.max() - provided_min + 1); + PrimExpr required_min = analyzer->Simplify(required.min()); + PrimExpr required_extent = analyzer->Simplify(required.max() - required_min + 1); + PrimExpr dom_min{nullptr}, dom_extent{nullptr}; + Var dom_var{ObjectPtr{nullptr}}; + arith::PVar p_v; + arith::PVar p_e; + if ((p_v * p_e).Match(provided_min) || (p_e * p_v).Match(provided_min)) { + PrimExpr e = p_e.Eval(); + dom_var = p_v.Eval(); + dom_min = floordiv(required_min, e); + dom_extent = analyzer->Simplify((required_extent + e - 1) / e); + } else if (analyzer->CanProveEqual(provided_extent, 1) && p_v.Match(provided_min)) { + dom_var = p_v.Eval(); + dom_min = required_min; + dom_extent = required_extent; + } else { + ICHECK(false) << "ValueError: BufferRegion pattern match failed"; + } + auto it = iter_doms->find(dom_var.get()); + if (it != iter_doms->end()) { + std::vector& doms = it->second; + doms.push_back(arith::IntSet::FromMinExtent(dom_min, dom_extent)); + } else { + ICHECK(analyzer->CanProveEqual(provided_min, required_min)); + ICHECK(analyzer->CanProveEqual(provided_extent, required_extent)); + } +} + +/*! + * \brief Calculate the domain of block vars to cover the required region + * \param iter_vars The list of block vars to cover the required region + * \param provided_regions The region provided by one iteration instance of the block vars + * \param required_regions The region required to be covered + * \param analyzer The arithmetic analyzer + * \return A list of iteration domain corresponding to the given list of block vars + */ +std::vector CalculateBlockVarDomain( + const Array& iter_vars, + std::unordered_map> provided_regions, + std::unordered_map> required_regions, + arith::Analyzer* analyzer) { + int n_iters = iter_vars.size(); + // Step 1. Construct the mapping from block var to their iteration domain (initialized to empty) + std::unordered_map> iter_doms; + iter_doms.reserve(n_iters); + for (const IterVar& iter_var : iter_vars) { + iter_doms[iter_var->var.get()] = {}; + } + // Step 2. 
For each buffer, update the domain according to the provided and required regions + for (const auto& kv : provided_regions) { + const BufferNode* buffer = kv.first; + const std::vector& many_provided_regions = kv.second; + // Calculate `provided_region` and `required_region` + auto it = required_regions.find(buffer); + if (it == required_regions.end() || it->second.empty()) { + continue; + } + NDIntSet required_region = support::NDIntSetUnion(it->second); + NDIntSet provided_region = support::NDIntSetUnion(many_provided_regions); + ICHECK_EQ(provided_region.size(), buffer->shape.size()); + ICHECK_EQ(required_region.size(), buffer->shape.size()); + // For each dimension, update the iteration domain + int ndim = buffer->shape.size(); + for (int i = 0; i < ndim; ++i) { + arith::IntSet provided = provided_region[i]; + arith::IntSet required = required_region[i]; + required = arith::Intersect( + {std::move(required), arith::IntSet::FromMinExtent(Integer(0), buffer->shape[i])}); + UpdateBlockVarDomain(provided, required, &iter_doms, analyzer); + } + } + // Union the iter var domains, put them in the same order of block vars, and return + std::vector result; + result.reserve(n_iters); + for (const IterVar& iter_var : iter_vars) { + const std::vector& doms = iter_doms.at(iter_var->var.get()); + arith::IntSet dom = arith::IntSet::FromRange(iter_var->dom); + if (!doms.empty()) { + dom = arith::Intersect({std::move(dom), arith::Union(doms)}); + } + PrimExpr min = analyzer->Simplify(dom.min()); + PrimExpr extent = analyzer->Simplify(dom.max() - min + 1); + result.push_back(Range::FromMinExtent(min, extent)); + } + return result; +} + +/*! + * \brief Calculate the provided region of the given block by one single of its execution instance, + * as well as the required buffer regions relaxed to the given loop + * \tparam is_compute_at Indicates if the operation is compute-at or reverse-compute-at + * \param block The given block that provides buffer regions + * \param loop_sref The given loop under which the block is going to be moved to + * \param block2realize Maps a block to its corresponding BlockRealize + * \param producer_srefs The producers of the given block + * \param consumer_srefs The consumers of the given block + * \param provided_regions The calculated regions provided by the block + * \param required_regions The calculated regions required by its consumers (in compute-at) or + * producers (in reverse-compute-at) + */ +template +void CalculateProvidedRequiredRegions( + const BlockNode* block, const StmtSRef& loop_sref, + std::unordered_map block2realize, + Array producer_srefs, Array consumer_srefs, + std::unordered_map>* provided_regions, + std::unordered_map>* required_regions) { + // Step 1. Calculate the region provided by a single execution instance of `block` + const Array& provided_buffers = is_compute_at ? block->writes : block->reads; + provided_regions->reserve(provided_buffers.size()); + required_regions->reserve(provided_buffers.size()); + for (const BufferRegion& provided_buffer_region : provided_buffers) { + const BufferNode* buffer = provided_buffer_region->buffer.get(); + const Array& region = provided_buffer_region->region; + (*provided_regions)[buffer].push_back(support::NDIntSetFromRegion(region)); + (*required_regions)[buffer].clear(); + } + // Step 2. Calculate the region required by dependent blocks under `loop` + for (const StmtSRef& required_block_sref : is_compute_at ? 
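UpdateBlockVarDomain above matches the provided region's min against the pattern v * e: each instance of the block var then provides e consecutive elements, so covering a required range of extent rext starting at rmin needs v to range over [floordiv(rmin, e), ceildiv(rext, e)) instances. A numeric sketch of that arithmetic (plain integers, not TVM PrimExprs):

#include <cassert>

int main() {
  int e = 4;                // elements provided per block-var instance
  int rmin = 8, rext = 10;  // required region: [8, 18)
  int dom_min = rmin / e;               // floordiv -> 2
  int dom_extent = (rext + e - 1) / e;  // ceildiv  -> 3
  // v in [2, 5) provides [8, 20), which covers the required [8, 18).
  assert(dom_min == 2 && dom_extent == 3);
}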
consumer_srefs : producer_srefs) { + const BlockNode* required_block = TVM_SREF_TO_BLOCK(required_block, required_block_sref); + ICHECK(block2realize.count(required_block)); + RelaxBufferRegions( + /*binding=*/GetBindings(GetRef(block2realize.at(required_block))), + /*buffer_regions=*/is_compute_at ? required_block->reads : required_block->writes, + /*relax_path_low_inclusive=*/GetRef(required_block_sref->parent), + /*relax_path_high_exclusive=*/loop_sref, /*relaxed=*/required_regions); + } +} + +/******** Main Implementation ********/ + +template +void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_sref, + const StmtSRef& loop_sref, bool preserve_unit_loops) { + const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + // Step 1. Bunch of checks + // Check condition 1) and 2): stage pipeline and subtree compact dataflow + StmtSRef scope_root_sref = GetScopeRoot(self, block_sref, + /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/true); + Block scope_root = GetRef(scope_root_sref->StmtAs()); + BlockScope scope = self->GetBlockScope(scope_root_sref); + Array producer_srefs = GetProducers(block_sref, scope); + Array consumer_srefs = GetConsumers(block_sref, scope); + arith::Analyzer analyzer; + // Check condition 3): `block` and `loop` are under the same scope, + // and `loop` is not the ancestor of `block` + NotInSameScopeError::CheckAndBindLoopDomain(self, block_sref, loop_sref, scope_root_sref, + &analyzer); + // Check condition 4): `block` is not an output block + if (is_compute_at) { + CheckNotOutputBlock(self, block_sref, scope_root_sref); + } + // Step 2. Plan for the removal of `block` + ScopeReconstructor reconstructor(scope_root, GetRef(block), GetRef(loop)); + LeafBlockRemovalPlan(self, block_sref, &reconstructor.rm_src_stmt_, &reconstructor.rm_tgt_stmt_); + // Step 3. Find the insertion point under `loop` + // Check condition 5): all the required block are under the given loop + std::unordered_map block2realize; + block2realize.reserve(self->block_info.size()); + int insert_position = FindInsertionPoint( + /*self=*/self, + /*subtrees=*/AsArray(loop->body), + /*producer_srefs=*/producer_srefs, + /*consumer_srefs=*/consumer_srefs, /*block2realize=*/&block2realize); + // Step 4. Calculate the region provided by a single execution instance of `block`, + // as well as the region required by dependent blocks under `loop`. + // Here is the definition of `provide` and `require`: + // - In compute-at, `provide` means `produce`, and `require` means `consume` + // - In reverse-compute-at, `provide` means `consume`, and `require` means `produce` + std::unordered_map> provided_regions; + std::unordered_map> required_regions; + CalculateProvidedRequiredRegions( + /*block=*/block, /*loop_sref=*/loop_sref, /*block2realize=*/std::move(block2realize), + /*producer_srefs=*/std::move(producer_srefs), + /*consumer_srefs=*/std::move(consumer_srefs), + /*provided_regions=*/&provided_regions, /*required_regions=*/&required_regions); + // Step 5. Calculate the iteration domain for each block var + std::vector iter_doms = + CalculateBlockVarDomain(/*iter_vars=*/block->iter_vars, + /*provided_regions=*/std::move(provided_regions), + /*required_regions=*/std::move(required_regions), + /*analyzer=*/&analyzer); + // Step 6. 
Create the new scope according to the iteration domain + reconstructor.MakeNewLoop(/*insert_position=*/insert_position, /*iter_doms=*/std::move(iter_doms), + /*preserve_unit_loops=*/preserve_unit_loops); + Block new_scope_root = Downcast(reconstructor(scope_root)); + // Step 7. Do the actual replacement + self->Replace(scope_root_sref, new_scope_root, {{scope_root, new_scope_root}}); + // Step 8. Update the cached flags + BlockInfo& block_info = self->block_info[block_sref]; + block_info.affine_binding = IsAffineBinding( + /*realize=*/reconstructor.new_block_realize_, + /*loop_var_ranges=*/LoopDomainOfSRefTreePath(GetRef(block_sref->parent)), + /*analyzer=*/&analyzer); +} + +void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, + bool preserve_unit_loops) { + ComputeAtOrReverseComputeAtImpl(self, block_sref, loop_sref, preserve_unit_loops); +} + +void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, + bool preserve_unit_loops) { + ComputeAtOrReverseComputeAtImpl(self, block_sref, loop_sref, preserve_unit_loops); +} + +/******** InstructionKind Registration ********/ + +struct ComputeAtTraits : public UnpackedInstTraits { + static constexpr const char* kName = "ComputeAt"; + static constexpr bool kIsPure = false; + + private: + static constexpr size_t kNumInputs = 2; + static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumDecisions = 0; + + static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, LoopRV loop_rv, + Bool preserve_unit_loops) { + return sch->ComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool()); + } + + static String UnpackedAsPython(Array outputs, String block_rv, String loop_rv, + Bool preserve_unit_loops) { + PythonAPICall py("compute_at"); + py.Input("block", block_rv); + py.Input("loop", loop_rv); + py.Input("preserve_unit_loops", preserve_unit_loops.operator bool()); + return py.Str(); + } + + template + friend struct ::tvm::tir::UnpackedInstTraits; +}; + +struct ReverseComputeAtTraits : public UnpackedInstTraits { + static constexpr const char* kName = "ReverseComputeAt"; + static constexpr bool kIsPure = false; + + private: + static constexpr size_t kNumInputs = 2; + static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumDecisions = 0; + + static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, LoopRV loop_rv, + Bool preserve_unit_loops) { + return sch->ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool()); + } + + static String UnpackedAsPython(Array outputs, String block_rv, String loop_rv, + Bool preserve_unit_loops) { + PythonAPICall py("reverse_compute_at"); + py.Input("block", block_rv); + py.Input("loop", loop_rv); + py.Input("preserve_unit_loops", preserve_unit_loops.operator bool()); + return py.Str(); + } + + template + friend struct ::tvm::tir::UnpackedInstTraits; +}; + +TVM_REGISTER_INST_KIND_TRAITS(ComputeAtTraits); +TVM_REGISTER_INST_KIND_TRAITS(ReverseComputeAtTraits); + +} // namespace tir +} // namespace tvm diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc index 9c88cc1e787a..c2de78863d79 100644 --- a/src/tir/schedule/primitive/compute_inline.cc +++ b/src/tir/schedule/primitive/compute_inline.cc @@ -97,31 +97,6 @@ class BodyAnalysisError : public ScheduleError { Block block_; }; -class OnlyLeafError : public ScheduleError { - public: - explicit OnlyLeafError(IRModule mod, Block leaf_block, StmtSRef scope_root_sref) - : mod_(mod), 
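The *Traits structs earlier in this hunk serialize an applied instruction back into a Python call so traces can be replayed; a toy sketch of the string the ComputeAt traits would render (the `sch.` prefix is an assumption about the trace printer, not shown in this diff):

#include <iostream>
#include <sstream>
#include <string>

std::string AsPython(const std::string& block_rv, const std::string& loop_rv,
                     bool preserve_unit_loops) {
  std::ostringstream os;
  os << "sch.compute_at(block=" << block_rv << ", loop=" << loop_rv
     << ", preserve_unit_loops=" << (preserve_unit_loops ? "True" : "False") << ")";
  return os.str();
}

int main() {
  // Prints: sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True)
  std::cout << AsPython("b0", "l1", true) << "\n";
}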
leaf_block_(std::move(leaf_block)), scope_root_(nullptr) { - const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref); - this->scope_root_ = GetRef(scope_root); - } - - String FastErrorString() const final { - return "ScheduleError: Cannot remove the only leaf in the scope"; - } - - String DetailRenderTemplate() const final { - return "Block {0} is the only leaf in the scope {1}, which cannot be removed; Otherwise the " - "scope will be empty."; - } - - IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {leaf_block_, scope_root_}; } - - IRModule mod_; - Block leaf_block_; - Block scope_root_; -}; - class NonSingleProducerError : public ScheduleError { public: explicit NonSingleProducerError(IRModule mod, Block block) @@ -188,76 +163,6 @@ class OpaqueAccessError : public ScheduleError { Block scope_root_; }; -/*! - * \brief Construct a new AST, with a specific sref tree leaf removed. - * The leaf's ancestors who have only a single child will be removed too. - * \param leaf_block_sref The block/loop sref to the sref tree leaf to be removed - * \param src_stmt The root of the subtree where the replacement begins - * \param tgt_stmt The root of the subtree after the replacement - * \return A boolean indicating if the leaf can be removed successfully - * \note Removal is not conducted beyond scope-level. - * - * An example of the removal plan, say we are removing the leaf block "B" from the AST. - * - * \code - * with block([], "scope_root"): - * ... - * with block([128, 128], "B") as [vi, vj]: - * B[vi, vj] = A[vi, vj] + 1.0 - * with block([128, 128], "C") as [vi, vj]: - * C[vi, vj] = B[vi, vj] * 2.0 - * \endcode - * - * Ths method does not mutate the AST, instead it returns the a `(src_stmt, tgt_stmt)` pair as a - * plan to substitute certain pieces of the IR. - * - * In our example, it returns block "scope_root" as `src_stmt`, and the result `tgt_stmt` is: - * - * \code - * with block([], "scope_root"): - * ... - * with block([128, 128], "C") as [vi, vj]: - * C[vi, vj] = B[vi, vj] * 2.0 - * \endcode - */ -bool LeafBlockRemovalPlan(const StmtSRef& leaf_block_sref, Stmt* src_stmt, Stmt* tgt_stmt) { - // Go upwards until find an ancestor with more than one child - const StmtNode* last_stmt = leaf_block_sref->stmt; - StmtSRefNode* sref = leaf_block_sref->parent; - for (;; last_stmt = sref->stmt, sref = sref->parent) { - if (const auto* loop = sref->StmtAs()) { - if (const auto* seq = loop->body.as()) { - if (seq->size() > 1) { - break; - } - } - } else { - // Removal is not done beyond scope-level. - // When encountering a block, i.e. the scope root, we simply stop - break; - } - } - if (const auto* block = sref->StmtAs()) { - if (const auto* seq = block->body.as()) { - ObjectPtr n = make_object(*block); - n->body = RemoveFromSeqStmt(GetRef(seq), GetRef(last_stmt)); - *src_stmt = GetRef(block); - *tgt_stmt = Stmt(std::move(n)); - return true; - } - } - if (const auto* loop = sref->StmtAs()) { - if (const auto* seq = loop->body.as()) { - ObjectPtr n = make_object(*loop); - n->body = RemoveFromSeqStmt(GetRef(seq), GetRef(last_stmt)); - *src_stmt = GetRef(loop); - *tgt_stmt = Stmt(std::move(n)); - return true; - } - } - return false; -} - /*! 
* \brief The base class of the inliner, which handles: * 1) Substitute a subtree with the specific block being inlined @@ -622,8 +527,9 @@ void ComputeInline(ScheduleState self, const StmtSRef& producer_block_sref) { Block producer_block = GetRef(_producer_block); Buffer inlined_buffer = NotSingleReadWriteBuffer::GetSingleWrite(self, producer_block); // Step 1. Get the scope block - StmtSRef scope_root_sref = - GetScopeRoot(self, producer_block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_root_sref = GetScopeRoot(self, producer_block_sref, // + /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/false); // Step 2. Check completeness CheckCompleteBlock(self, producer_block_sref, scope_root_sref); // Step 3. Analyze the block body @@ -632,9 +538,7 @@ void ComputeInline(ScheduleState self, const StmtSRef& producer_block_sref) { throw BodyAnalysisError(false, self->mod, producer_block); } // Step 4. Create a plan that removes the leaf block to be inlined - if (!LeafBlockRemovalPlan(producer_block_sref, &inliner.src_stmt, &inliner.tgt_stmt)) { - throw OnlyLeafError(self->mod, producer_block, scope_root_sref); - } + LeafBlockRemovalPlan(self, producer_block_sref, &inliner.src_stmt, &inliner.tgt_stmt); // Step 5. Create an AST where the leaf `producer_block_sref` points to is removed, // and update other blocks who read from the removed block Stmt tgt_stmt = inliner(GetRef(scope_root_sref->stmt)); @@ -650,8 +554,9 @@ void ReverseComputeInline(ScheduleState self, const StmtSRef& consumer_block_sre Block consumer_block = GetRef(_consumer_block); Buffer inlined_buffer = NotSingleReadWriteBuffer::GetSingleRead(self, consumer_block); // Step 1. Get the scope block - StmtSRef scope_root_sref = - GetScopeRoot(self, consumer_block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_root_sref = GetScopeRoot(self, consumer_block_sref, // + /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/false); // Step 2. Check completeness CheckCompleteBlock(self, consumer_block_sref, scope_root_sref); // Step 3. Check if the consumer has a single complete producer @@ -662,9 +567,7 @@ void ReverseComputeInline(ScheduleState self, const StmtSRef& consumer_block_sre throw BodyAnalysisError(true, self->mod, consumer_block); } // Step 5. Create a plan that removes the leaf block to be inlined - if (!LeafBlockRemovalPlan(consumer_block_sref, &inliner.src_stmt, &inliner.tgt_stmt)) { - throw OnlyLeafError(self->mod, consumer_block, scope_root_sref); - } + LeafBlockRemovalPlan(self, consumer_block_sref, &inliner.src_stmt, &inliner.tgt_stmt); // Step 6. 
Create an AST where the leaf `consumer_block_sref` points to is removed, // and update other blocks who read from the removed block Stmt tgt_stmt = inliner(GetRef(scope_root_sref->stmt)); @@ -675,7 +578,7 @@ void ReverseComputeInline(ScheduleState self, const StmtSRef& consumer_block_sre self->Replace(scope_root_sref, tgt_stmt, inliner.block_reuse); } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct ComputeInlineTraits : public UnpackedInstTraits { static constexpr const char* kName = "ComputeInline"; diff --git a/src/tir/schedule/primitive/for_kind.cc b/src/tir/schedule/primitive/for_kind.cc index a6056d607042..008d47792f69 100644 --- a/src/tir/schedule/primitive/for_kind.cc +++ b/src/tir/schedule/primitive/for_kind.cc @@ -27,7 +27,7 @@ class WrongBlockIterTypeError : public ScheduleError { : mod_(std::move(mod)), loop_var_(std::move(loop_var)), block_(std::move(block)) { op_str_ = for_kind == ForKind::kParallel ? "parallel" - : for_kind == ForKind::kVectorized ? "vectorize" : "bind"; + : (for_kind == ForKind::kVectorized ? "vectorize" : "bind"); } String FastErrorString() const final { std::ostringstream os; @@ -151,7 +151,9 @@ void ParallelizeComputation(const ScheduleState& self, const StmtSRef& loop_sref * parallelized/vectorized/bound. */ // Step 1. Check whether the subtree rooted from the `loop` in sref tree has compact data flow. - CheckSRefSubtreeCompactDataFlow(self, loop_sref); + GetScopeRoot(self, loop_sref, // + /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/true); // Step 2. Check whether the loop can be parallelized/vectorized/bound with regard to each // underlying block. @@ -187,7 +189,7 @@ void Unroll(ScheduleState self, const StmtSRef& loop_sref) { self->Replace(loop_sref, For(new_loop), {}); } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct ParallelTraits : public UnpackedInstTraits { static constexpr const char* kName = "Parallel"; @@ -251,7 +253,7 @@ struct BindTraits : public UnpackedInstTraits { static String UnpackedAsPython(Array outputs, String loop_rv, String thread) { PythonAPICall py("bind"); py.Input("loop", loop_rv); - py.Input("thread", thread); + py.Input("thread_axis", thread); return py.Str(); } diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc index a8d9c5a69dc9..8b32a9c14f58 100644 --- a/src/tir/schedule/primitive/get_block_loop.cc +++ b/src/tir/schedule/primitive/get_block_loop.cc @@ -55,7 +55,7 @@ Array GetLoops(const StmtSRef& block_sref) { return {result.rbegin(), result.rend()}; } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct GetBlockTraits : public UnpackedInstTraits { static constexpr const char* kName = "GetBlock"; diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index 7c2b61344427..95c92aa0a322 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -687,7 +687,7 @@ void Reorder(ScheduleState self, const Array& ordered_loop_srefs) { self->Replace(GetRef(top), new_loop, {}); } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct SplitTraits : public UnpackedInstTraits { static constexpr const char* kName = "Split"; diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index 
af77e51e4d83..677b64311855 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -427,7 +427,7 @@ class BaseBlockCreator { CreateReadWriteRegions(); String new_block_name = old_block_realize_->block->name_hint; - PrimExpr predicate = Bool(true); + PrimExpr predicate = const_true(); if (is_rf_block_) { new_block_name = new_block_name + "_rf"; predicate = old_block_realize_->predicate; @@ -860,7 +860,9 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax BlockRealize block_realize = CheckGetSingleChildBlockRealizeOnSRefTree(self, rf_loop_sref); const StmtSRef& block_sref = self->stmt2ref.at(block_realize->block.get()); const Block& block = block_realize->block; - StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_root = GetScopeRoot(self, block_sref, // + /*require_stage_pipeline=*/true, + /*require_subtree_compact_dataflow=*/false); CheckReductionBlock(self, block_sref, scope_root); const ForNode* rf_loop = TVM_SREF_TO_FOR(rf_loop, rf_loop_sref); if (rf_loop->kind != ForKind::kSerial) { @@ -954,7 +956,7 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax return new_block_srefs[0]; } -/******** Instruction Registration ********/ +/******** InstructionKind Registration ********/ struct RFactorTraits : public UnpackedInstTraits { static constexpr const char* kName = "RFactor"; diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc index ac40d27c4bf3..8843ac613179 100644 --- a/src/tir/schedule/primitive/sampling.cc +++ b/src/tir/schedule/primitive/sampling.cc @@ -19,7 +19,6 @@ #include -#include "../primitive.h" #include "../utils.h" namespace tvm { @@ -51,6 +50,8 @@ int64_t SampleCategorical(support::LinearCongruentialEngine::TRandState* rand_st return candidates[i]; } +/******** InstructionKind Registration ********/ + struct SampleCategoricalTraits : public UnpackedInstTraits { static constexpr const char* kName = "SampleCategorical"; static constexpr bool kIsPure = true; @@ -79,7 +80,8 @@ struct SampleCategoricalTraits : public UnpackedInstTraits; + template + friend struct ::tvm::tir::UnpackedInstTraits; }; TVM_REGISTER_INST_KIND_TRAITS(SampleCategoricalTraits); diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index fd30b02fc9dd..4262a099b59d 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -146,6 +146,10 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheRead") TVM_REGISTER_GLOBAL("tir.schedule.ScheduleCacheWrite") .set_body_method(&ScheduleNode::CacheWrite); /******** (FFI) Compute location ********/ +TVM_REGISTER_GLOBAL("tir.schedule.ScheduleComputeAt") + .set_body_method(&ScheduleNode::ComputeAt); +TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReverseComputeAt") + .set_body_method(&ScheduleNode::ReverseComputeAt); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleComputeInline") .set_body_method(&ScheduleNode::ComputeInline); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleReverseComputeInline") diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc index 799806bef7b5..4604add3bdb4 100644 --- a/src/tir/schedule/state.cc +++ b/src/tir/schedule/state.cc @@ -35,15 +35,22 @@ using SMap = std::unordered_map; * \param dom_high_exclusive The highest node in the sref tree path * \return An n-dimensional integer set */ -Array AnalyzeRegionUpperBound(const BufferRegion& region, - const StmtSRef& dom_low_inclusive, - const StmtSRef& 
dom_high_exclusive) { - return arith::EvalSet( - region->region, - AsIntSet(LoopDomainOfSRefTreePath( - /*low_inclusive=*/dom_low_inclusive, - /*high_exclusive=*/dom_high_exclusive, - /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope())))); +Array AnalyzeRegionUpperBound(const BufferRegion& region, // + const PrimExpr& predicate, // + const StmtSRef& dom_low_inclusive, // + const StmtSRef& dom_high_exclusive, // + arith::Analyzer* analyzer) { + Map var_dom = LoopDomainOfSRefTreePath( + /*low_inclusive=*/dom_low_inclusive, + /*high_exclusive=*/dom_high_exclusive, + /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope())); + if (Optional> result = EstimateRegionLowerBound( + /*region=*/region->region, + /*var_dom=*/var_dom, + /*predicate=*/predicate, /*analyzer=*/analyzer)) { + return result.value(); + } + return arith::EvalSet(region->region, AsIntSet(var_dom)); } /*! @@ -56,19 +63,19 @@ Array AnalyzeRegionUpperBound(const BufferRegion& region, * \param analyzer The analyzer * \return An n-dimensional integer set */ -Array AnalyzeRegionLowerBound(const BlockRealize& realize, - const BufferRegion& region, - const StmtSRef& dom_low_inclusive, - const StmtSRef& dom_high_exclusive, +Array AnalyzeRegionLowerBound(const BufferRegion& region, // + const PrimExpr& predicate, // + const StmtSRef& dom_low_inclusive, // + const StmtSRef& dom_high_exclusive, // arith::Analyzer* analyzer) { + Map var_dom = LoopDomainOfSRefTreePath( + /*low_inclusive=*/dom_low_inclusive, + /*high_exclusive=*/dom_high_exclusive, + /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope())); if (Optional> result = EstimateRegionLowerBound( /*region=*/region->region, - /*var_dom=*/ - LoopDomainOfSRefTreePath( - /*low_inclusive=*/dom_low_inclusive, - /*high_exclusive=*/dom_high_exclusive, - /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope())), - /*predicate=*/realize->predicate, /*analyzer=*/analyzer)) { + /*var_dom=*/var_dom, + /*predicate=*/predicate, /*analyzer=*/analyzer)) { return result.value(); } return Array(region->buffer->shape.size(), arith::IntSet::Nothing()); @@ -90,16 +97,16 @@ bool ProducerCoversConsumer(const Array& buffer_shape, ICHECK_EQ(produced_region.size(), consumed_region.size()); int ndim = produced_region.size(); for (int i = 0; i < ndim; ++i) { - Range buffer_size = Range::FromMinExtent(0, buffer_shape[i]); + arith::IntSet buffer_size = arith::IntSet::FromMinExtent(0, buffer_shape[i]); if (produced_region[i].IsNothing()) { return false; } - Range produced = produced_region[i].CoverRange(buffer_size); - Range consumed = consumed_region[i].CoverRange(buffer_size); - PrimExpr produced_min = produced->min; - PrimExpr produced_max = produced->min + produced->extent; - PrimExpr consumed_min = consumed->min; - PrimExpr consumed_max = consumed->min + consumed->extent; + arith::IntSet produced = arith::Intersect({produced_region[i], buffer_size}); + arith::IntSet consumed = arith::Intersect({consumed_region[i], buffer_size}); + PrimExpr produced_min = analyzer->Simplify(produced.min()); + PrimExpr produced_max = analyzer->Simplify(produced.max() - produced_min + 1); + PrimExpr consumed_min = analyzer->Simplify(consumed.min()); + PrimExpr consumed_max = analyzer->Simplify(consumed.max() - consumed_min + 1); if (!analyzer->CanProve((produced_min <= consumed_min) && (consumed_max <= produced_max))) { return false; } @@ -276,6 +283,8 @@ class StateCreator : private StmtVisitor { for (const auto& kv : info.scope->dst2deps) { const 
StmtSRef& consumer_block_sref = kv.first; const Array& deps = kv.second; + const BlockNode* consumer_block = TVM_SREF_TO_BLOCK(consumer_block, consumer_block_sref); + const BlockRealize& consumer_realize = block2realize_.at(consumer_block); bool& region_cover = self_->block_info.at(consumer_block_sref).region_cover = true; // Step 2.1. Extract the path to the scope root std::unordered_map> lca_loc; @@ -334,11 +343,12 @@ class StateCreator : private StmtVisitor { // and to make sure region cover property must be satisfied once the flag is on // Therefore, we use lower-bound analysis for producers and upper-bound analysis for // consumer, and require that the produced region can cover the consumed region - touched_region.push_back(AnalyzeRegionLowerBound(/*realize=*/producer_realize, - /*region=*/region, - /*dom_low_inclusive=*/parent_sref, - /*dom_high_exclusive=*/lca, - /*analyzer=*/&analyzer_)); + touched_region.push_back(AnalyzeRegionLowerBound( + /*region=*/region, + /*predicate=*/producer_realize->predicate, + /*dom_low_inclusive=*/parent_sref, + /*dom_high_exclusive=*/lca, + /*analyzer=*/&analyzer_)); } } } @@ -353,8 +363,10 @@ class StateCreator : private StmtVisitor { arith::UnionRegionLowerBound({touched_region.begin(), touched_region.end()}); Array consumed_region = AnalyzeRegionUpperBound( /*region=*/region, + /*predicate=*/consumer_realize->predicate, /*dom_low_inclusive=*/parent_sref, - /*dom_high_exclusive=*/lca); + /*dom_high_exclusive=*/lca, + /*analyzer=*/&analyzer_); if (!ProducerCoversConsumer(buffer->shape, produced_region, consumed_region, &analyzer_)) { region_cover = false; @@ -920,8 +932,8 @@ void ScheduleStateNode::Replace(const tir::StmtSRef& _src_sref, const Stmt& tgt_ // Before step `i`: // 1) `child_sref` is `src_sref` going up by `i` steps // 2) `child_tgt_stmt` is the subtree that `child_sref` should correspond to after replacement - // 3) except for the subtree root, srefs that point to the subtree of `child_tgt_stmt` are - // correct 4) for the subtree root of `child_tgt_stmt`, `child_sref` has not pointed to it yet + // 3) except for the subtree root, srefs that point to the subtree of `child_tgt_stmt` are correct + // 4) for the subtree root of `child_tgt_stmt`, `child_sref` has not pointed to it yet // 5) `tgt_stmt` is of type Loop, Block or BlockRealize // // During step `i`: @@ -1029,24 +1041,6 @@ TVM_DLL Array GetCachedFlags(const ScheduleState& self, const StmtSRef& bl Bool(info.scope->stage_pipeline)}; } -TVM_DLL void ScheduleStateNode::UpdateAffineFlag(const StmtSRef& scope_sref) { - auto it = this->block_info.find(scope_sref); - ICHECK(it != this->block_info.end()) << "Cannot find the block info of the given block."; - BlockInfo& info = it->second; - - bool is_root_block = scope_sref->parent == nullptr; - if (is_root_block) { - info.affine_binding = true; - } else { - BlockRealize realize = GetBlockRealize(GetRef(this), scope_sref); - arith::Analyzer analyzer; - StmtSRef parent_sref = GetRef(scope_sref->parent); - info.affine_binding = IsAffineBinding(/*realize=*/realize, - /*loop_var_ranges=*/LoopDomainOfSRefTreePath(parent_sref), - /*analyzer=*/&analyzer); - } -} - /**************** FFI ****************/ TVM_REGISTER_NODE_TYPE(ScheduleStateNode); diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index f429a917858b..6f679598c9d1 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -192,6 +192,28 @@ BlockRV TracedScheduleNode::CacheWrite(const BlockRV& block_rv, int 
write_buffer /******** Schedule: Compute location ********/ +void TracedScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) { + ConcreteScheduleNode::ComputeAt(block_rv, loop_rv, preserve_unit_loops); + + static const InstructionKind& kind = InstructionKind::Get("ComputeAt"); + trace_->Append(/*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{block_rv, loop_rv}, + /*attrs=*/{Integer(preserve_unit_loops)}, + /*outputs=*/{})); +} + +void TracedScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) { + ConcreteScheduleNode::ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops); + + static const InstructionKind& kind = InstructionKind::Get("ReverseComputeAt"); + trace_->Append(/*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{block_rv, loop_rv}, + /*attrs=*/{Integer(preserve_unit_loops)}, + /*outputs=*/{})); +} + void TracedScheduleNode::ComputeInline(const BlockRV& block_rv) { ConcreteScheduleNode::ComputeInline(block_rv); diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index a6b5251a96a3..fb89783b6036 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -76,6 +76,9 @@ class TracedScheduleNode : public ConcreteScheduleNode { BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope) final; /******** Schedule: Compute location ********/ + void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) final; + void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, + bool preserve_unit_loops) final; void ComputeInline(const BlockRV& block_rv) final; void ReverseComputeInline(const BlockRV& block_rv) final; /******** Schedule: Reduction ********/ diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index da376fdde90f..ffb6b2d52628 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -17,14 +17,13 @@ * under the License. 
*/ -#include "./transform.h" - #include "./utils.h" namespace tvm { namespace tir { /******** Annotation ********/ + Block WithAnnotation(const BlockNode* block, const String& attr_key, const ObjectRef& attr_value) { Map annotations = block->annotations; annotations.Set(attr_key, attr_value); @@ -71,5 +70,71 @@ Array ReplaceBuffer(Array match_buffers, c return match_buffers; } +/******** Block Removal ********/ + +void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_sref, + Stmt* src_stmt, Stmt* tgt_stmt) { + class OnlyLeafError : public ScheduleError { + public: + explicit OnlyLeafError(IRModule mod, Block leaf_block, Block scope_root) + : mod_(mod), leaf_block_(leaf_block), scope_root_(scope_root) {} + + String FastErrorString() const final { + return "ScheduleError: Cannot remove the only leaf in the scope"; + } + + String DetailRenderTemplate() const final { + return "Block {0} is the only leaf in the scope {1}, which cannot be removed; otherwise the " + "scope will be empty."; + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {leaf_block_, scope_root_}; } + + IRModule mod_; + Block leaf_block_; + Block scope_root_; + }; + + // Go upwards until we find an ancestor with more than one child + const StmtNode* last_stmt = leaf_block_sref->stmt; + StmtSRefNode* sref = leaf_block_sref->parent; + for (;; last_stmt = sref->stmt, sref = sref->parent) { + if (const auto* loop = sref->StmtAs()) { + if (const auto* seq = loop->body.as()) { + if (seq->size() > 1) { + break; + } + } + } else { + // Removal is not done beyond scope-level. + // When encountering a block, i.e. the scope root, we simply stop + break; + } + } + if (const auto* block = sref->StmtAs()) { + if (const auto* seq = block->body.as()) { + ObjectPtr n = make_object(*block); + n->body = RemoveFromSeqStmt(GetRef(seq), GetRef(last_stmt)); + *src_stmt = GetRef(block); + *tgt_stmt = Stmt(std::move(n)); + return; + } + } + if (const auto* loop = sref->StmtAs()) { + if (const auto* seq = loop->body.as()) { + ObjectPtr n = make_object(*loop); + n->body = RemoveFromSeqStmt(GetRef(seq), GetRef(last_stmt)); + *src_stmt = GetRef(loop); + *tgt_stmt = Stmt(std::move(n)); + return; + } + } + ICHECK(sref != nullptr && sref->stmt != nullptr); + const auto* leaf_block = TVM_SREF_TO_BLOCK(leaf_block, leaf_block_sref); + const auto* scope_block = TVM_SREF_TO_BLOCK(scope_block, sref); + throw OnlyLeafError(self->mod, GetRef(leaf_block), GetRef(scope_block)); +} + } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h index 85cce9da216e..3932c4bdbd3d 100644 --- a/src/tir/schedule/transform.h +++ b/src/tir/schedule/transform.h @@ -64,6 +64,46 @@ Array ReplaceBuffer(Array regions, const Buffer& sou */ Array ReplaceBuffer(Array match_buffers, const Buffer& source, const Buffer& target); + +/******** Block Removal ********/ + +/*! + * \brief Construct a new AST, with a specific sref tree leaf removed. + * The leaf's ancestors that have only a single child will be removed too. + * \param self The schedule state + * \param leaf_block_sref The block/loop sref to the sref tree leaf to be removed + * \param src_stmt The root of the subtree where the replacement begins + * \param tgt_stmt The root of the subtree after the replacement + * \throws OnlyLeafError if the leaf is the only leaf in its scope, since removing it would leave the scope empty + * \note Read before use: + * 1) Removal is not conducted beyond scope-level. + * 2) This method only works properly when the scope root is a stage pipeline.
+ * + * An example of the removal plan: say we are removing the leaf block "B" from the AST. + * + * \code + * with block([], "scope_root"): + * ... + * with block([128, 128], "B") as [vi, vj]: + * B[vi, vj] = A[vi, vj] + 1.0 + * with block([128, 128], "C") as [vi, vj]: + * C[vi, vj] = B[vi, vj] * 2.0 + * \endcode + * + * This method does not mutate the AST; instead, it returns a `(src_stmt, tgt_stmt)` pair as a + * plan to substitute certain pieces of the IR. + * + * In our example, it returns block "scope_root" as `src_stmt`, and the result `tgt_stmt` is: + * + * \code + * with block([], "scope_root"): + * ... + * with block([128, 128], "C") as [vi, vj]: + * C[vi, vj] = B[vi, vj] * 2.0 + * \endcode + */ +void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_sref, + Stmt* src_stmt, Stmt* tgt_stmt); + } // namespace tir } // namespace tvm diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h index c2f430181664..a63a9f079617 100644 --- a/src/tir/schedule/utils.h +++ b/src/tir/schedule/utils.h @@ -34,10 +34,12 @@ #include #include +#include "../../arith/pattern_match.h" #include "../../node/attr_registry.h" #include "../../printer/text_printer.h" #include "../../runtime/thread_storage_scope.h" #include "../../support/array.h" +#include "../../support/nd_int_set.h" #include "./analysis.h" #include "./error.h" #include "./instruction_traits.h" @@ -163,6 +165,21 @@ inline Stmt RemoveFromSeqStmt(const SeqStmt& seq, const Stmt& to_remove) { return SeqStmt::Flatten(new_stmts); } +/*! + * \brief Convert a Stmt to an Array. + * \param stmt The Stmt to be converted + * \return If the Stmt is SeqStmt, then returns the sequence; + * otherwise, returns a single-element Array with the Stmt inside. + */ +inline Array AsArray(const Stmt& stmt) { + if (const auto* seq_stmt = stmt.as()) { + return seq_stmt->seq; + } + return {stmt}; +} + +/******** IterVar ********/ + /*!
* \brief Create a new IterVar for the input For loop, with specified name and type * \param loop The loop to be created from diff --git a/src/tir/transforms/compact_buffer_region.cc b/src/tir/transforms/compact_buffer_region.cc index 961ea1721fa1..a1f488f386b3 100644 --- a/src/tir/transforms/compact_buffer_region.cc +++ b/src/tir/transforms/compact_buffer_region.cc @@ -30,6 +30,7 @@ #include #include "../../support/arena.h" +#include "../../support/nd_int_set.h" #include "../../support/utils.h" #include "../schedule/utils.h" #include "ir_utils.h" @@ -37,62 +38,7 @@ namespace tvm { namespace tir { -using NDIntSet = std::vector; - -arith::IntSet IntSetFromMinExtent(const PrimExpr& min, const PrimExpr& extent) { - return arith::IntSet::FromRange(Range::FromMinExtent(min, extent)); -} - -NDIntSet NDIntSetFromRegion(const Region& region) { - NDIntSet result; - result.reserve(region.size()); - for (const Range& range : region) { - result.push_back(arith::IntSet::FromRange(range)); - } - return result; -} - -NDIntSet NDIntSetFromShape(const Array& shape) { - PrimExpr zero = Integer(0); - NDIntSet result; - result.reserve(shape.size()); - for (const PrimExpr& extent : shape) { - result.push_back(IntSetFromMinExtent(zero, extent)); - } - return result; -} - -NDIntSet NDIntSetFromPoint(const Array& indices) { - NDIntSet result; - result.reserve(indices.size()); - for (const PrimExpr& index : indices) { - result.push_back(arith::IntSet::SinglePoint(index)); - } - return result; -} - -void NDIntSetUnionWith(NDIntSet* lhs, const NDIntSet& rhs) { - ICHECK_EQ(lhs->size(), rhs.size()); - int ndim = rhs.size(); - for (int i = 0; i < ndim; ++i) { - arith::IntSet& int_set = lhs->at(i); - int_set = arith::Union({int_set, rhs.at(i)}); - } -} - -NDIntSet NDIntSetEmpty(int ndim) { - return std::vector(ndim, arith::IntSet::Nothing()); -} - -NDIntSet EvalNDIntSet(const NDIntSet& nd_int_set, - const std::unordered_map& dom_map) { - NDIntSet ret; - ret.reserve(nd_int_set.size()); - for (const arith::IntSet& s : nd_int_set) { - ret.push_back(arith::EvalSet(s, dom_map)); - } - return ret; -} +using support::NDIntSet; /*! * \brief Return the region collected by NDIntSet. Return the original buffer shape if the @@ -164,7 +110,8 @@ class BufferAccessRegionCollector : public StmtExprVisitor { // The iter_dom_map is updated in post-DFS order. // If the union point is under the for node, the loop var will not be relaxed. // If the union point is outer of the for loop, the loop var should be relaxed.
- iter_dom_map_on_post_order_[op->loop_var.get()] = IntSetFromMinExtent(op->min, op->extent); + iter_dom_map_on_post_order_[op->loop_var.get()] = + arith::IntSet::FromMinExtent(op->min, op->extent); } void VisitStmt_(const BlockNode* op) final { @@ -205,10 +152,10 @@ class BufferAccessRegionCollector : public StmtExprVisitor { for (const ForNode* loop : ancestor_loops_) { const VarNode* loop_var = loop->loop_var.get(); if (NeedRelaxThread(GetRef(loop), runtime::StorageScope::Create(buffer.scope()))) { - dom_map[loop_var] = IntSetFromMinExtent(loop->min, loop->extent); + dom_map[loop_var] = arith::IntSet::FromMinExtent(loop->min, loop->extent); } } - NDIntSet int_set = EvalNDIntSet(nd_int_set, dom_map); + NDIntSet int_set = support::NDIntSetEval(nd_int_set, dom_map); buffer_access_region_[buffer] = NarrowBufferRegionFromNDIntSet(int_set, buffer->shape); } } @@ -221,7 +168,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor { if (it != buffer_var_in_scope_.end()) { const Buffer& buffer = it->second; const BufferAccessInfo* info = - arena_.make(buffer, NDIntSetFromRegion(buffer_region->region)); + arena_.make(buffer, support::NDIntSetFromRegion(buffer_region->region)); buffer_access_stack_.push(info); } } @@ -246,10 +193,11 @@ class BufferAccessRegionCollector : public StmtExprVisitor { while (buffer_access_stack_.size() > stack_top) { const BufferAccessInfo* info = buffer_access_stack_.top(); buffer_access_stack_.pop(); - NDIntSet nd_int_set = EvalNDIntSet(info->accessed_region, iter_dom_map_on_post_order_); + NDIntSet nd_int_set = + support::NDIntSetEval(info->accessed_region, iter_dom_map_on_post_order_); auto it = accesses.find(info->buffer); if (it != accesses.end()) { - NDIntSetUnionWith(&it->second, nd_int_set); + support::NDIntSetUnionWith(&it->second, nd_int_set); } else { accesses[info->buffer] = nd_int_set; } diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc index a41905c148bf..262906ade2e8 100644 --- a/src/tir/transforms/ir_utils.cc +++ b/src/tir/transforms/ir_utils.cc @@ -113,8 +113,9 @@ class IRConvertSSA final : public StmtExprMutator { PrimExpr VisitExpr_(const LoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - if (scope_.count(op->buffer_var.get())) { - return Load(op->dtype, scope_[op->buffer_var.get()].back(), op->index, op->predicate); + const VarNode* v = op->buffer_var.get(); + if (scope_.count(v) && !scope_[v].empty()) { + return Load(op->dtype, scope_[v].back(), op->index, op->predicate); } else { return expr; } @@ -122,8 +123,9 @@ class IRConvertSSA final : public StmtExprMutator { Stmt VisitStmt_(const StoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - if (scope_.count(op->buffer_var.get())) { - return Store(scope_[op->buffer_var.get()].back(), op->value, op->index, op->predicate); + const VarNode* v = op->buffer_var.get(); + if (scope_.count(v) && !scope_[v].empty()) { + return Store(scope_[v].back(), op->value, op->index, op->predicate); } else { return stmt; } diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index 481b1bfd4b19..6f7c09cdcf2d 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -119,6 +119,17 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { } } + Stmt VisitStmt_(const StoreNode* op) final { + auto it = store_remap_.find(op->buffer_var.get()); + if (it != store_remap_.end()) { + ICHECK(is_zero(op->index)); + 
auto value = StmtExprMutator::VisitExpr(op->value); + return Store(it->second, value, 0, op->predicate); + } else { + return StmtExprMutator::VisitStmt_(op); + } + } + std::unordered_map new_storage_scopes_; private: @@ -328,6 +339,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { PrimExpr pred = const_true(types[i].lanes()); Var var = shared_bufs[i]; load_remap_[buffers[i]] = Load(types[i], var, index, pred); + store_remap_[buffers[i]] = var; Array extents{PrimExpr(1)}; auto node = Allocate(var, types[i], extents, pred, Evaluate(0)); alloc_remap_[buffers[i]] = node; @@ -370,6 +382,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { alloc_remap_[buffers[idx]] = Allocate(shared_bufs[idx], types[idx], {PrimExpr(group_extent), PrimExpr(reduce_extent)}, pred, Evaluate(0)); + store_remap_[buffers[idx]] = shared_bufs[idx]; } } @@ -587,6 +600,8 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { std::vector reduce_combiner_; // The load remap std::unordered_map load_remap_; + // The store remap + std::unordered_map store_remap_; // Allocate remap std::unordered_map alloc_remap_; // Allocate from warp reductions diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index f5a553aa0598..99d71ebe15bd 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -113,6 +113,16 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. int64_t nbytes = GetVectorBytes(op->dtype); + if (device_type_.defined()) { + if (const auto* dev_type = device_type_.as()) { + if (dev_type->value == kDLCPU) { + int32_t constant_size = op->constant_allocation_size(); + if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { + return stmt; + } + } + } + } PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes); for (size_t i = 0; i < op->extents.size(); ++i) { total_bytes = total_bytes * op->extents[i]; diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc index f9400bf59df6..21f863bb2e70 100644 --- a/src/topi/schedule.cc +++ b/src/topi/schedule.cc @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -39,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -139,10 +137,6 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_softmax").set_body([](TVMArgs args, TVMR *rv = topi::rocm::schedule_softmax(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.rocm.schedule_lrn").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = topi::rocm::schedule_lrn(args[0]); -}); - /* CUDA schedules */ TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = cuda::dense_cuda(args[0], args[1], args[2], args[3], args[4]); @@ -177,10 +171,6 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax").set_body([](TVMArgs args, TVMR *rv = topi::cuda::schedule_softmax(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = topi::cuda::schedule_lrn(args[0]); -}); - /* Utility functions */ TVM_REGISTER_GLOBAL("topi.utils.is_empty_shape").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = topi::detail::is_empty_shape(args[0]); diff --git a/tests/micro/arduino/README.md b/tests/micro/arduino/README.md index 78e63cabb7e2..0b039ba6de7c 100644 --- a/tests/micro/arduino/README.md +++ b/tests/micro/arduino/README.md @@ -22,14 +22,14 @@ all of the appropriate TVM dependencies installed. 
You can run the test with: ``` $ cd tvm/tests/micro/arduino -$ pytest --microtvm-platforms spresense +$ pytest --arduino-board=spresense ``` Most of these tests require a supported Arduino board to be connected. If you don't want to run these tests, you can pass the flag `--test-build-only` to only test project generation and compilation. -To see the list of supported values for `----microtvm-platforms`, run: +To see the list of supported values for `--arduino-board`, run: ``` $ pytest --help ``` diff --git a/tests/micro/arduino/conftest.py b/tests/micro/arduino/conftest.py index aea1381a43f8..bb9c69bf4a0e 100644 --- a/tests/micro/arduino/conftest.py +++ b/tests/micro/arduino/conftest.py @@ -20,20 +20,8 @@ import pytest import tvm.target.target - -# The models that should pass this configuration. Maps a short, identifying platform string to -# (model, zephyr_board). -PLATFORMS = { - "due": ("sam3x8e", "due"), - "feathers2": ("esp32", "feathers2"), - "metrom4": ("atsamd51", "metrom4"), - "nano33ble": ("nrf52840", "nano33ble"), - "pybadge": ("atsamd51", "pybadge"), - "spresense": ("cxd5602gg", "spresense"), - "teensy40": ("imxrt1060", "teensy40"), - "teensy41": ("imxrt1060", "teensy41"), - "wioterminal": ("atsamd51", "wioterminal"), -} +from tvm.micro import project +from tvm import micro, relay TEMPLATE_PROJECT_DIR = ( pathlib.Path(__file__).parent @@ -47,13 +35,30 @@ ).resolve() +def arduino_boards() -> dict: + """Returns a dict mapping board to target model""" + template = project.TemplateProject.from_directory(TEMPLATE_PROJECT_DIR) + project_options = template.info()["project_options"] + for option in project_options: + if option["name"] == "arduino_board": + boards = option["choices"] + if option["name"] == "arduino_model": + models = option["choices"] + + arduino_boards = {boards[i]: models[i] for i in range(len(boards))} + return arduino_boards + + +ARDUINO_BOARDS = arduino_boards() + + def pytest_addoption(parser): parser.addoption( - "--microtvm-platforms", + "--arduino-board", nargs="+", required=True, - choices=PLATFORMS.keys(), - help="Target platforms for microTVM tests.", + choices=ARDUINO_BOARDS.keys(), + help="Arduino board for tests.", ) parser.addoption( "--arduino-cli-cmd", @@ -91,8 +96,8 @@ def pytest_collection_modifyitems(config, items): # (to take advantage of multiple cores / external memory / etc.), so all tests # are parameterized by board def pytest_generate_tests(metafunc): - platforms = metafunc.config.getoption("microtvm_platforms") - metafunc.parametrize("platform", platforms, scope="session") + board = metafunc.config.getoption("arduino_board") + metafunc.parametrize("board", board, scope="session") @pytest.fixture(scope="session") @@ -105,12 +110,11 @@ def tvm_debug(request): return request.config.getoption("--tvm-debug") -def make_workspace_dir(test_name, platform): - _, arduino_board = PLATFORMS[platform] +def make_workspace_dir(test_name, board): filepath = pathlib.Path(__file__) board_workspace = ( filepath.parent - / f"workspace_{test_name}_{arduino_board}" + / f"workspace_{test_name}_{board}" / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") ) @@ -122,3 +126,42 @@ def make_workspace_dir(test_name, platform): t = tvm.contrib.utils.tempdir(board_workspace) # time.sleep(200) return t + + +def make_kws_project(board, arduino_cli_cmd, tvm_debug, workspace_dir): + this_dir = pathlib.Path(__file__).parent + model = ARDUINO_BOARDS[board] + build_config = {"debug": tvm_debug} + + with open(this_dir.parent / "testdata" / "kws" / "yes_no.tflite", "rb") as f: 
+ tflite_model_buf = f.read() + + # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 + try: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + + mod, params = relay.frontend.from_tflite(tflite_model) + target = tvm.target.target.micro( + model, options=["--link-params=1", "--unpacked-api=1", "--executor=aot"] + ) + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + mod = relay.build(mod, target, params=params) + + return tvm.micro.generate_project( + str(TEMPLATE_PROJECT_DIR), + mod, + workspace_dir / "project", + { + "arduino_board": board, + "arduino_cli_cmd": arduino_cli_cmd, + "project_type": "example_project", + "verbose": bool(build_config.get("debug")), + }, + ) diff --git a/tests/micro/arduino/test_arduino_error_detection.py b/tests/micro/arduino/test_arduino_error_detection.py new file mode 100644 index 000000000000..64e2c14d1c18 --- /dev/null +++ b/tests/micro/arduino/test_arduino_error_detection.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import pathlib +import re +import sys + +import pytest + +import conftest +from tvm.micro.project_api.server import ServerError + + +# A new project and workspace dir is created for EVERY test +@pytest.fixture +def workspace_dir(request, board): + return conftest.make_workspace_dir("arduino_error_detection", board) + + +@pytest.fixture +def project(board, arduino_cli_cmd, tvm_debug, workspace_dir): + return conftest.make_kws_project(board, arduino_cli_cmd, tvm_debug, workspace_dir) + + +def test_blank_project_compiles(workspace_dir, project): + project.build() + + +# Add a bug (an extra curly brace) and make sure the project doesn't compile +def test_bugged_project_compile_fails(workspace_dir, project): + with open(workspace_dir / "project" / "project.ino", "a") as main_file: + main_file.write("}\n") + with pytest.raises(ServerError): + project.build() + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/micro/arduino/test_arduino_rpc_server.py b/tests/micro/arduino/test_arduino_rpc_server.py index 1b165a02e9d1..f157214241c9 100644 --- a/tests/micro/arduino/test_arduino_rpc_server.py +++ b/tests/micro/arduino/test_arduino_rpc_server.py @@ -36,11 +36,10 @@ import conftest - -# We'll make a new workspace for each test -@pytest.fixture(scope="function") -def workspace_dir(platform): - return conftest.make_workspace_dir("arduino_rpc_server", platform) +# A new project and workspace dir is created for EVERY test +@pytest.fixture +def workspace_dir(board): + return conftest.make_workspace_dir("arduino_rpc_server", board) def _make_session(model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config): @@ -83,10 +82,10 @@ def _make_add_sess(model, arduino_board, arduino_cli_cmd, workspace_dir, build_c # The same test code can be executed on both the QEMU simulation and on real hardware. @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_compile_runtime(platform, arduino_cli_cmd, tvm_debug, workspace_dir): +def test_compile_runtime(board, arduino_cli_cmd, tvm_debug, workspace_dir): """Test compiling the on-device runtime.""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -102,16 +101,16 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.numpy() == np.array([6, 7])).all() - with _make_add_sess(model, arduino_board, arduino_cli_cmd, workspace_dir, build_config) as sess: + with _make_add_sess(model, board, arduino_cli_cmd, workspace_dir, build_config) as sess: test_basic_add(sess) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_platform_timer(platform, arduino_cli_cmd, tvm_debug, workspace_dir): +def test_platform_timer(board, arduino_cli_cmd, tvm_debug, workspace_dir): """Test compiling the on-device runtime.""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session.
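The board-to-model mapping these tests consume is no longer a hand-maintained `PLATFORMS` table; it is derived from the template project's own metadata. Below is a minimal sketch of that derivation and of the lookup a test performs, assuming the Project API reports the `arduino_board` and `arduino_model` choice lists in matching order (the template path shown is illustrative):

```python
# Sketch only: condenses the arduino_boards() helper from the conftest.py above.
import pathlib

from tvm.micro import project

TEMPLATE_PROJECT_DIR = (
    pathlib.Path(__file__).parent / ".." / ".." / ".." / "apps" / "microtvm" / "arduino" / "template_project"
).resolve()

template = project.TemplateProject.from_directory(TEMPLATE_PROJECT_DIR)
# Collect the choice lists reported by the template's Project API server.
choices = {
    opt["name"]: opt["choices"]
    for opt in template.info()["project_options"]
    if "choices" in opt
}
# Pair boards with models by index; assumes both lists share one ordering.
ARDUINO_BOARDS = dict(zip(choices["arduino_board"], choices["arduino_model"]))

# A parametrized test then resolves its target model from the board name,
# e.g. ARDUINO_BOARDS["due"] is expected to be "sam3x8e" per the old table.
```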
@@ -132,15 +131,15 @@ def test_basic_add(sess): assert result.mean > 0 assert len(result.results) == 3 - with _make_add_sess(model, arduino_board, arduino_cli_cmd, workspace_dir, build_config) as sess: + with _make_add_sess(model, board, arduino_cli_cmd, workspace_dir, build_config) as sess: test_basic_add(sess) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_relay(platform, arduino_cli_cmd, tvm_debug, workspace_dir): +def test_relay(board, arduino_cli_cmd, tvm_debug, workspace_dir): """Testing a simple relay graph""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} shape = (10,) @@ -156,9 +155,7 @@ def test_relay(platform, arduino_cli_cmd, tvm_debug, workspace_dir): with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(func, target=target) - with _make_session( - model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config - ) as session: + with _make_session(model, board, arduino_cli_cmd, workspace_dir, mod, build_config) as session: graph_mod = tvm.micro.create_local_graph_executor( mod.get_graph_json(), session.get_system_lib(), session.device ) @@ -172,9 +169,9 @@ def test_relay(platform, arduino_cli_cmd, tvm_debug, workspace_dir): @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_onnx(platform, arduino_cli_cmd, tvm_debug, workspace_dir): +def test_onnx(board, arduino_cli_cmd, tvm_debug, workspace_dir): """Testing a simple ONNX model.""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} # Load test images. @@ -200,7 +197,7 @@ def test_onnx(platform, arduino_cli_cmd, tvm_debug, workspace_dir): graph = lowered.get_graph_json() with _make_session( - model, arduino_board, arduino_cli_cmd, workspace_dir, lowered, build_config + model, board, arduino_cli_cmd, workspace_dir, lowered, build_config ) as session: graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device @@ -260,9 +257,9 @@ def check_result( @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_byoc_microtvm(platform, arduino_cli_cmd, tvm_debug, workspace_dir): +def test_byoc_microtvm(board, arduino_cli_cmd, tvm_debug, workspace_dir): """This is a simple test case to check BYOC capabilities of microTVM""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} x = relay.var("x", shape=(10, 10)) @@ -317,7 +314,7 @@ def test_byoc_microtvm(platform, arduino_cli_cmd, tvm_debug, workspace_dir): ), model=model, build_config=build_config, - arduino_board=arduino_board, + arduino_board=board, arduino_cli_cmd=arduino_cli_cmd, workspace_dir=workspace_dir, ) @@ -344,9 +341,9 @@ def _make_add_sess_with_shape( ) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_rpc_large_array(platform, arduino_cli_cmd, tvm_debug, workspace_dir, shape): +def test_rpc_large_array(board, arduino_cli_cmd, tvm_debug, workspace_dir, shape): """Test large RPC array transfer.""" - model, arduino_board = conftest.PLATFORMS[platform] + model = conftest.ARDUINO_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
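The recurring `# NOTE: run test in a nested function ...` comment is doing real work: NDArray handles pin device memory, so they must be garbage-collected before the session's context manager tears the connection down. A hedged illustration of the pattern (`make_session` is an invented stand-in for `_make_add_sess` and friends):

```python
# Illustration of the nested-function idiom used throughout these tests.
import numpy as np
import tvm


def run_checked(make_session):
    def body(sess):
        # Arrays allocated here are local to body(), so CPython frees them
        # when body() returns, while the RPC session is still open.
        a_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device)
        assert (a_data.numpy() == np.array([2, 3])).all()

    with make_session() as sess:
        body(sess)
    # The session is closed only after all device arrays have been freed.
```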
@@ -359,7 +356,7 @@ def test_tensors(sess): assert (C_data.numpy() == np.zeros(shape)).all() with _make_add_sess_with_shape( - model, arduino_board, arduino_cli_cmd, workspace_dir, shape, build_config + model, board, arduino_cli_cmd, workspace_dir, shape, build_config ) as sess: test_tensors(sess) diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py index 101d36f9bd2d..fe6ea8fe3b2e 100644 --- a/tests/micro/arduino/test_arduino_workflow.py +++ b/tests/micro/arduino/test_arduino_workflow.py @@ -17,12 +17,11 @@ import datetime import pathlib +import re import shutil import sys import pytest -import tvm -from tvm import micro, relay import conftest @@ -30,7 +29,7 @@ This unit test simulates a simple user workflow, where we: 1. Generate a base sketch using a simple audio model 2. Modify the .ino file, much like a user would -3. Compile the sketch for the target platform +3. Compile the sketch for the target board -- If physical hardware is present -- 4. Upload the sketch to a connected board 5. Open a serial connection to the board @@ -38,10 +37,11 @@ """ -# Since these tests are sequential, we'll use the same project for all tests +# Since these tests are sequential, we'll use the same project/workspace +# directory for all tests in this file @pytest.fixture(scope="module") -def workspace_dir(request, platform): - return conftest.make_workspace_dir("arduino_workflow", platform) +def workspace_dir(request, board): + return conftest.make_workspace_dir("arduino_workflow", board) @pytest.fixture(scope="module") @@ -49,49 +49,10 @@ def project_dir(workspace_dir): return workspace_dir / "project" -def _generate_project(arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config): - return tvm.micro.generate_project( - str(conftest.TEMPLATE_PROJECT_DIR), - mod, - workspace_dir / "project", - { - "arduino_board": arduino_board, - "arduino_cli_cmd": arduino_cli_cmd, - "project_type": "example_project", - "verbose": bool(build_config.get("debug")), - }, - ) - - # We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced too soon @pytest.fixture(scope="module") -def project(platform, arduino_cli_cmd, tvm_debug, workspace_dir): - this_dir = pathlib.Path(__file__).parent - model, arduino_board = conftest.PLATFORMS[platform] - build_config = {"debug": tvm_debug} - - with open(this_dir.parent / "testdata" / "kws" / "yes_no.tflite", "rb") as f: - tflite_model_buf = f.read() - - # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 - try: - import tflite.Model - - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - except AttributeError: - import tflite - - tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) - - mod, params = relay.frontend.from_tflite(tflite_model) - target = tvm.target.target.micro( - model, options=["--link-params=1", "--unpacked-api=1", "--executor=aot"] - ) - - with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - mod = relay.build(mod, target, params=params) - - return _generate_project(arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config) +def project(board, arduino_cli_cmd, tvm_debug, workspace_dir): + return conftest.make_kws_project(board, arduino_cli_cmd, tvm_debug, workspace_dir) def _get_directory_elements(directory): @@ -120,7 +81,16 @@ def test_model_header_templating(project_dir, project): # Ensure model.h was templated with correct WORKSPACE_SIZE with (project_dir / "src" / "model.h").open() as f: model_h = 
f.read() - assert "#define WORKSPACE_SIZE 21312" in model_h + workspace_size_defs = re.findall(r"\#define WORKSPACE_SIZE ([0-9]*)", model_h) + assert workspace_size_defs + assert len(workspace_size_defs) == 1 + + # Make sure the WORKSPACE_SIZE we define is a reasonable size. We don't want + # to set an exact value, as this test shouldn't break if an improvement to + # TVM causes the amount of memory needed to decrease. + workspace_size = int(workspace_size_defs[0]) + assert workspace_size < 30000 + assert workspace_size > 10000 def test_import_rerouting(project_dir, project): diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md index 9769cae2b53b..09376e42f8bb 100644 --- a/tests/micro/zephyr/README.md +++ b/tests/micro/zephyr/README.md @@ -32,11 +32,11 @@ device) using: ``` $ cd tvm/tests/micro/zephyr -$ pytest test_zephyr.py --microtvm-platforms=host # For QEMU emulation -$ pytest test_zephyr.py --microtvm-platforms=nrf5340dk # For nRF5340DK +$ pytest test_zephyr.py --zephyr-board=qemu_x86 # For QEMU emulation +$ pytest test_zephyr.py --zephyr-board=nrf5340dk_nrf5340_cpuapp # For nRF5340DK ``` -To see the list of supported values for `--microtvm-platforms`, run: +To see the list of supported values for `--zephyr-board`, run: ``` $ pytest test_zephyr.py --help ``` diff --git a/tests/micro/zephyr/conftest.py b/tests/micro/zephyr/conftest.py index cfdb208c92b8..7c19b62ac63d 100644 --- a/tests/micro/zephyr/conftest.py +++ b/tests/micro/zephyr/conftest.py @@ -20,33 +20,45 @@ import pytest +from tvm.micro import project import tvm.contrib.utils import tvm.target.target -# The models that should pass this configuration. Maps a short, identifying platform string to -# (model, zephyr_board). -PLATFORMS = { - "qemu_x86": ("host", "qemu_x86"), - "qemu_riscv32": ("host", "qemu_riscv32"), - "qemu_riscv64": ("host", "qemu_riscv64"), - "mps2_an521": ("mps2_an521", "mps2_an521"), - "nrf5340dk": ("nrf5340dk", "nrf5340dk_nrf5340_cpuapp"), - "stm32f746xx_disco": ("stm32f746xx", "stm32f746g_disco"), - "stm32f746xx_nucleo": ("stm32f746xx", "nucleo_f746zg"), - "stm32l4r5zi_nucleo": ("stm32l4r5zi", "nucleo_l4r5zi"), - "zynq_mp_r5": ("zynq_mp_r5", "qemu_cortex_r5"), -} +TEMPLATE_PROJECT_DIR = ( + pathlib.Path(__file__).parent + / ".." + / ".." + / ".." + / "apps" + / "microtvm" + / "zephyr" + / "template_project" +).resolve() + + +def zephyr_boards() -> dict: + """Returns a dict mapping board to target model""" + template = project.TemplateProject.from_directory(TEMPLATE_PROJECT_DIR) + project_options = template.info()["project_options"] + for option in project_options: + if option["name"] == "zephyr_board": + boards = option["choices"] + if option["name"] == "zephyr_model": + models = option["choices"] + + zephyr_boards = {boards[i]: models[i] for i in range(len(boards))} + return zephyr_boards + + +ZEPHYR_BOARDS = zephyr_boards() def pytest_addoption(parser): parser.addoption( - "--microtvm-platforms", - default="qemu_x86", - choices=PLATFORMS.keys(), - help=( - "Specify a comma-separated list of test models (i.e. as passed to tvm.target.micro()) " - "for microTVM tests." - ), + "--zephyr-board", + required=True, + choices=ZEPHYR_BOARDS.keys(), + help=("Zephyr board for test."), ) parser.addoption( "--west-cmd", default="west", help="Path to `west` command for flashing device."
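As in the Arduino suite, `ZEPHYR_BOARDS` is built at collection time from the template project's reported options, so the lookup behaves like the removed `PLATFORMS` table with the board name as the key. The snapshot below is illustrative only; the values are carried over from the old table and assumed unchanged:

```python
# Hypothetical snapshot of the derived mapping; the real dict is produced by
# zephyr_boards() from template.info()["project_options"] at runtime.
ZEPHYR_BOARDS = {
    "qemu_x86": "host",
    "nrf5340dk_nrf5340_cpuapp": "nrf5340dk",
    "nucleo_l4r5zi": "stm32l4r5zi",
    "qemu_cortex_r5": "zynq_mp_r5",
}

model = ZEPHYR_BOARDS["qemu_x86"]  # later handed to tvm.target.target.micro(model)
```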
@@ -60,8 +72,8 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): - if "platform" in metafunc.fixturenames: - metafunc.parametrize("platform", metafunc.config.getoption("microtvm_platforms").split(",")) + if "board" in metafunc.fixturenames: + metafunc.parametrize("board", [metafunc.config.getoption("zephyr_board")]) @pytest.fixture @@ -75,13 +87,12 @@ def tvm_debug(request): @pytest.fixture -def temp_dir(platform): - _, zephyr_board = PLATFORMS[platform] +def temp_dir(board): parent_dir = pathlib.Path(os.path.dirname(__file__)) filename = os.path.splitext(os.path.basename(__file__))[0] board_workspace = ( parent_dir - / f"workspace_{filename}_{zephyr_board}" + / f"workspace_{filename}_{board}" / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") ) board_workspace_base = str(board_workspace) diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 5a7e69e3c7f9..d2d5522b1a0a 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -15,10 +15,6 @@ # specific language governing permissions and limitations # under the License. -import contextlib -import copy -import datetime -import glob import logging import os import pathlib @@ -46,8 +42,6 @@ _LOG = logging.getLogger(__name__) -PLATFORMS = conftest.PLATFORMS - def _make_sess_from_op( temp_dir, model, zephyr_board, west_cmd, op_name, sched, arg_bufs, build_config @@ -60,21 +54,9 @@ def _make_sess_from_op( return _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config) -TEMPLATE_PROJECT_DIR = ( - pathlib.Path(__file__).parent - / ".." - / ".." - / ".." - / "apps" - / "microtvm" - / "zephyr" - / "template_project" -).resolve() - - def _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config): project = tvm.micro.generate_project( - str(TEMPLATE_PROJECT_DIR), + str(conftest.TEMPLATE_PROJECT_DIR), mod, temp_dir / "project", { @@ -101,10 +83,10 @@ def _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config, dtype= # The same test code can be executed on both the QEMU simulation and on real hardware. @tvm.testing.requires_micro -def test_add_uint(temp_dir, platform, west_cmd, tvm_debug): +def test_add_uint(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -120,12 +102,12 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.numpy() == np.array([6, 7])).all() - with _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config) as sess: + with _make_add_sess(temp_dir, model, board, west_cmd, build_config) as sess: test_basic_add(sess) def has_fpu(zephyr_board): - sys.path.insert(0, str(TEMPLATE_PROJECT_DIR)) + sys.path.insert(0, str(conftest.TEMPLATE_PROJECT_DIR)) try: import microtvm_api_server finally: @@ -136,11 +118,11 @@ def has_fpu(zephyr_board): # The same test code can be executed on both the QEMU simulation and on real hardware. 
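For context on what `_make_session` and the `_make_add_sess` helpers share, here is a condensed sketch of the generate/build/flash/session flow, following the code in this file (`template_dir` and `temp_dir` are supplied by the caller; this is a sketch, not a drop-in replacement):

```python
# Condensed from _make_session in test_zephyr.py.
import tvm
import tvm.micro


def make_session(template_dir, board, west_cmd, mod, temp_dir, debug=False):
    project = tvm.micro.generate_project(
        str(template_dir),
        mod,
        temp_dir / "project",
        {
            "zephyr_board": board,
            "west_cmd": west_cmd,
            "verbose": bool(debug),
            "project_type": "host_driven",
        },
    )
    project.build()  # compile the generated Zephyr app
    project.flash()  # flash the board (or launch QEMU for qemu_* boards)
    return tvm.micro.Session(project.transport())
```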
@tvm.testing.requires_micro -def test_add_float(temp_dir, platform, west_cmd, tvm_debug): +def test_add_float(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model, zephyr_board = PLATFORMS[platform] - if not has_fpu(zephyr_board): - pytest.skip(f"FPU not enabled for {platform}") + model = conftest.ZEPHYR_BOARDS[board] + if not has_fpu(board): + pytest.skip(f"FPU not enabled for {board}") build_config = {"debug": tvm_debug} @@ -157,17 +139,15 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.numpy() == np.array([7, 8])).all() - with _make_add_sess( - temp_dir, model, zephyr_board, west_cmd, build_config, dtype="float32" - ) as sess: + with _make_add_sess(temp_dir, model, board, west_cmd, build_config, dtype="float32") as sess: test_basic_add(sess) @tvm.testing.requires_micro -def test_platform_timer(temp_dir, platform, west_cmd, tvm_debug): +def test_platform_timer(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -188,14 +168,14 @@ def test_basic_add(sess): assert result.mean > 0 assert len(result.results) == 3 - with _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config) as sess: + with _make_add_sess(temp_dir, model, board, west_cmd, build_config) as sess: test_basic_add(sess) @tvm.testing.requires_micro -def test_relay(temp_dir, platform, west_cmd, tvm_debug): +def test_relay(temp_dir, board, west_cmd, tvm_debug): """Testing a simple relay graph""" - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} shape = (10,) dtype = "int8" @@ -211,7 +191,7 @@ def test_relay(temp_dir, platform, west_cmd, tvm_debug): with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(ir_mod, target=target) - with _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config) as session: + with _make_session(temp_dir, board, west_cmd, mod, build_config) as session: graph_mod = tvm.micro.create_local_graph_executor( mod.get_graph_json(), session.get_system_lib(), session.device ) @@ -224,9 +204,9 @@ def test_relay(temp_dir, platform, west_cmd, tvm_debug): @tvm.testing.requires_micro -def test_onnx(temp_dir, platform, west_cmd, tvm_debug): +def test_onnx(temp_dir, board, west_cmd, tvm_debug): """Testing a simple ONNX model.""" - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} this_dir = pathlib.Path(os.path.dirname(__file__)) @@ -254,7 +234,7 @@ def test_onnx(temp_dir, platform, west_cmd, tvm_debug): lowered = relay.build(relay_mod, target, params=params) graph = lowered.get_graph_json() - with _make_session(temp_dir, zephyr_board, west_cmd, lowered, build_config) as session: + with _make_session(temp_dir, board, west_cmd, lowered, build_config) as session: graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device ) @@ -301,9 +281,9 @@ def check_result( @tvm.testing.requires_micro -def test_byoc_microtvm(temp_dir, platform, west_cmd, tvm_debug): +def test_byoc_microtvm(temp_dir, board, west_cmd, tvm_debug): """This is a simple test case to check BYOC capabilities of microTVM""" - model, zephyr_board = PLATFORMS[platform] + model = 
conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -357,7 +337,7 @@ def test_byoc_microtvm(temp_dir, platform, west_cmd, tvm_debug): axis=0, ), model=model, - zephyr_board=zephyr_board, + zephyr_board=board, west_cmd=west_cmd, build_config=build_config, ) @@ -381,9 +361,9 @@ def _make_add_sess_with_shape(temp_dir, model, zephyr_board, west_cmd, shape, bu ], ) @tvm.testing.requires_micro -def test_rpc_large_array(temp_dir, platform, west_cmd, tvm_debug, shape): +def test_rpc_large_array(temp_dir, board, west_cmd, tvm_debug, shape): """Test large RPC array transfer.""" - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -395,11 +375,149 @@ def test_tensors(sess): C_data = tvm.nd.array(np.zeros(shape, dtype="int8"), device=sess.device) assert (C_data.numpy() == np.zeros(shape)).all() - with _make_add_sess_with_shape( - temp_dir, model, zephyr_board, west_cmd, shape, build_config - ) as sess: + with _make_add_sess_with_shape(temp_dir, model, board, west_cmd, shape, build_config) as sess: test_tensors(sess) +@tvm.testing.requires_micro +def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): + """Test AutoTune for microTVM Zephyr""" + import tvm.relay as relay + + model = conftest.ZEPHYR_BOARDS[board] + + # Create a Relay model + data_shape = (1, 3, 16, 16) + weight_shape = (8, 3, 5, 5) + data = relay.var("data", relay.TensorType(data_shape, "float32")) + weight = relay.var("weight", relay.TensorType(weight_shape, "float32")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + kernel_layout="OIHW", + out_dtype="float32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + + data_sample = np.random.rand(data_shape[0], data_shape[1], data_shape[2], data_shape[3]).astype( + "float32" + ) + weight_sample = np.random.rand( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3] + ).astype("float32") + params = {mod["main"].params[1].name_hint: weight_sample} + + target = tvm.target.target.micro(model) + pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}) + with pass_context: + tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target) + assert len(tasks) > 0 + + repo_root = pathlib.Path( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding="utf-8").strip() + ) + template_project_dir = repo_root / "apps" / "microtvm" / "zephyr" / "template_project" + module_loader = tvm.micro.AutoTvmModuleLoader( + template_project_dir=template_project_dir, + project_options={ + "zephyr_board": board, + "west_cmd": west_cmd, + "verbose": 1, + "project_type": "host_driven", + }, + ) + builder = tvm.autotvm.LocalBuilder( + n_parallel=1, + build_kwargs={"build_option": {"tir.disable_vectorize": True}}, + do_fork=True, + build_func=tvm.micro.autotvm_build_func, + ) + runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader) + + measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner) + + log_path = pathlib.Path("zephyr_autotune.log") + if log_path.exists(): + log_path.unlink() + + n_trial = 10 + for task in tasks: + tuner = tvm.autotvm.tuner.GATuner(task) + tuner.tune( + n_trial=n_trial, + measure_option=measure_option, 
+ callbacks=[ + tvm.autotvm.callback.log_to_file(str(log_path)), + tvm.autotvm.callback.progress_bar(n_trial, si_prefix="M"), + ], + si_prefix="M", + ) + + assert tuner.best_flops > 0 + + # Build without tuning + with pass_context: + lowered = tvm.relay.build(mod, target=target, params=params) + + temp_dir = utils.tempdir() + project = tvm.micro.generate_project( + str(template_project_dir), + lowered, + temp_dir / "project", + { + "zephyr_board": board, + "west_cmd": west_cmd, + "verbose": 1, + "project_type": "host_driven", + }, + ) + project.build() + project.flash() + + with tvm.micro.Session(project.transport()) as session: + graph_mod = tvm.micro.create_local_graph_executor( + lowered.get_graph_json(), session.get_system_lib(), session.device + ) + graph_mod.set_input(**lowered.get_params()) + graph_mod.run(data=data_sample) + expected_output = graph_mod.get_output(0).numpy() + del graph_mod + + # Build using autotune logs + with tvm.autotvm.apply_history_best(str(log_path)): + with pass_context: + lowered_tuned = tvm.relay.build(mod, target=target, params=params) + + temp_dir = utils.tempdir() + project = tvm.micro.generate_project( + str(template_project_dir), + lowered_tuned, + temp_dir / "project", + { + "zephyr_board": board, + "west_cmd": west_cmd, + "verbose": 1, + "project_type": "host_driven", + }, + ) + project.build() + project.flash() + + with tvm.micro.Session(project.transport()) as session: + graph_mod = tvm.micro.create_local_graph_executor( + lowered_tuned.get_graph_json(), session.get_system_lib(), session.device + ) + graph_mod.set_input(**lowered_tuned.get_params()) + graph_mod.run(data=data_sample) + output = graph_mod.get_output(0).numpy() + del graph_mod + + tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5) + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index 37aa0f76a852..6c72d3d7becf 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -42,8 +42,6 @@ _LOG = logging.getLogger(__name__) -PLATFORMS = conftest.PLATFORMS - def _build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): template_project_dir = ( @@ -135,13 +133,19 @@ def _get_message(fd, expr: str, timeout_sec: int): @tvm.testing.requires_micro -def test_tflite(temp_dir, platform, west_cmd, tvm_debug): +def test_tflite(temp_dir, board, west_cmd, tvm_debug): """Testing a TFLite model.""" - if platform not in ["qemu_x86", "mps2_an521", "nrf5340dk", "stm32l4r5zi_nucleo", "zynq_mp_r5"]: + if board not in [ + "qemu_x86", + "mps2_an521", + "nrf5340dk_nrf5340_cpuapp", + "nucleo_l4r5zi", + "qemu_cortex_r5", + ]: pytest.skip(msg="Model does not fit.") - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] input_shape = (1, 32, 32, 3) output_shape = (1, 10) build_config = {"debug": tvm_debug} @@ -195,7 +199,7 @@ def test_tflite(temp_dir, platform, west_cmd, tvm_debug): project, _ = _build_project( temp_dir, - zephyr_board, + board, west_cmd, lowered, build_config, @@ -218,12 +222,12 @@ def test_tflite(temp_dir, platform, west_cmd, tvm_debug): @tvm.testing.requires_micro -def test_qemu_make_fail(temp_dir, platform, west_cmd, tvm_debug): +def test_qemu_make_fail(temp_dir, board, west_cmd, tvm_debug): """Testing QEMU make fail.""" - if platform not in ["qemu_x86", "mps2_an521"]: + if board not in ["qemu_x86", "mps2_an521"]: pytest.skip(msg="Only for QEMU 
targets.") - model, zephyr_board = PLATFORMS[platform] + model = conftest.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} shape = (10,) dtype = "float32" @@ -254,7 +258,7 @@ def test_qemu_make_fail(temp_dir, platform, west_cmd, tvm_debug): project, project_dir = _build_project( temp_dir, - zephyr_board, + board, west_cmd, lowered, build_config, diff --git a/tests/python/contrib/test_cmsisnn/test_networks.py b/tests/python/contrib/test_cmsisnn/test_networks.py new file mode 100644 index 000000000000..1f6e0e711f0c --- /dev/null +++ b/tests/python/contrib/test_cmsisnn/test_networks.py @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""CMSIS-NN: testing with networks""" + +import platform +import sys +import os +import pathlib +import tvm +from tvm import relay +from tvm.contrib.download import download_testdata +from tvm.relay.op.contrib import cmsisnn +import numpy as np +import pytest +import itertools + +from tests.python.relay.aot.aot_test_utils import ( + AOTTestModel, + AOT_CORSTONE300_RUNNER, + generate_ref_data, + compile_and_run, +) + + +def get_range_for_dtype_str(dtype): + """ + Produce the min,max for a give data type. 
+ + Parameters + ---------- + dtype : str + a type string (e.g., int8) + + Returns + ------- + type_info.min : int + the minimum of the range + type_info.max : int + the maximum of the range + """ + + try: + type_info = np.iinfo(dtype) + except ValueError: + type_info = np.finfo(dtype) + return type_info.min, type_info.max + + +def convert_to_relay( + tflite_model_buf, + input_data, + input_node, +): + def convert_to_list(x): + if not isinstance(x, list): + x = [x] + return x + + # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 + try: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except ImportError: + raise ImportError("The tflite package must be installed") + + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) + + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_node): + shape_dict[e] = input_data[i].shape + dtype_dict[e] = input_data[i].dtype.name + + mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict + ) + + return mod, params + + +@pytest.mark.skipif( + platform.machine() == "i686", reason="Reference system unavailable in i386 container" +) +def test_cnn_small(): + # download the model + base_url = "https://github.com/ARM-software/ML-zoo/raw/master/models/keyword_spotting/cnn_small/tflite_int8" + file_to_download = "cnn_s_quantized.tflite" + model_file = download_testdata("{}/{}".format(base_url, file_to_download), file_to_download) + + with open(model_file, "rb") as f: + tflite_model_buf = f.read() + + input_shape = (1, 490) + in_min, in_max = get_range_for_dtype_str("int8") + input_data = np.random.randint(in_min, high=in_max, size=input_shape).astype(np.float32) + + orig_mod, params = convert_to_relay(tflite_model_buf, input_data, "input") + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + + # validate CMSIS-NN output against CPU output + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_CORSTONE300_RUNNER + inputs = {"input": input_data} + params = {} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + compile_and_run( + AOTTestModel(module=cmsisnn_mod, inputs=inputs, outputs=output_list, params=params), + test_runner, + interface_api, + use_unpacked_api, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py index afbc302af66f..c1951d1f2ce5 100644 --- a/tests/python/contrib/test_cmsisnn/test_softmax.py +++ b/tests/python/contrib/test_cmsisnn/test_softmax.py @@ -17,16 +17,52 @@ """CMSIS-NN integration tests: softmax""" -import pytest +import platform import sys - +import os +import pathlib import tvm from tvm import relay from tvm.relay.op.contrib import cmsisnn import numpy as np +import pytest +import itertools + +from tests.python.relay.aot.aot_test_utils import ( + AOTTestModel, + AOT_CORSTONE300_RUNNER, + generate_ref_data, + compile_and_run, +) + + +def get_range_for_dtype_str(dtype): + """ + Produce the min, max for a given data type.
+ + Parameters + ---------- + dtype : str + a type string (e.g., int8) + + Returns + ------- + type_info.min : int + the minimum of the range + type_info.max : int + the maximum of the range + """ + + try: + type_info = np.iinfo(dtype) + except ValueError: + type_info = np.finfo(dtype) + return type_info.min, type_info.max def count_num_calls(mod): + """Count number of CallNode in the IRModule""" + class CallCounter(relay.ExprVisitor): def __init__(self): super().__init__() @@ -45,33 +81,50 @@ def visit_call(self, call): def make_module(func): + """Create IRModule from Function""" func = relay.Function(relay.analysis.free_vars(func), func) mod = tvm.IRModule.from_expr(func) return relay.transform.InferType()(mod) -def make_model(shape, zero_point, scale, in_dtype, out_dtype): - a = relay.var("a", shape=shape, dtype=in_dtype) +def make_model( + shape, in_dtype, out_dtype, in_zero_point, in_scale, out_zero_point=-128, out_scale=1.0 / 256 +): + + """Create a Relay Function / network model""" + a = relay.var("in0", shape=shape, dtype=in_dtype) dequantize = relay.qnn.op.dequantize( a, - input_scale=relay.const(scale, "float32"), - input_zero_point=relay.const(zero_point, "int32"), + input_scale=relay.const(in_scale, "float32"), + input_zero_point=relay.const(in_zero_point, "int32"), ) softmax = relay.nn.softmax(dequantize) model = relay.qnn.op.quantize( softmax, - output_scale=relay.const(scale, "float32"), - output_zero_point=relay.const(zero_point, "int32"), + output_scale=relay.const(out_scale, "float32"), + output_zero_point=relay.const(out_zero_point, "int32"), out_dtype=out_dtype, ) return model -def test_softmax_int8(): - model = make_model([1, 16, 16, 3], 64, 0.02, "int8", "int8") +@pytest.mark.skipif( + platform.machine() == "i686", reason="Reference system unavailable in i386 container" +) +@pytest.mark.parametrize(["zero_point", "scale"], [[33, 0.256], [-64, 0.0128]]) +def test_softmax_int8(zero_point, scale): + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_CORSTONE300_RUNNER + + dtype = "int8" + shape = [1, 16, 16, 3] + model = make_model(shape, dtype, dtype, zero_point, scale) orig_mod = make_module(model) + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod) + # validate pattern matching attrs = [ cmsisnn_mod[var.name_hint].attrs for var in cmsisnn_mod.get_global_vars() @@ -88,10 +141,52 @@ def test_softmax_int8(): cmsisnn_mod ), "Number of calls changed during partitioning" + # validate the output + in_min, in_max = get_range_for_dtype_str(dtype) + np.random.seed(0) + input_data = np.random.randint(in_min, high=in_max, size=shape, dtype=dtype) + inputs = {"in0": input_data} + params = {} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + compile_and_run( + AOTTestModel(module=cmsisnn_mod, inputs=inputs, outputs=output_list, params=params), + test_runner, + interface_api, + use_unpacked_api, + ) + + +def parameterize_for_invalid_model(test): + in_dtype = ["uint8", "int8"] + out_dtype = ["uint8", "int8"] + zero_point = [-128, 64] + scale = [1.0 / 256, 0.2] + out_zero_point = [-128, 33] + out_scale = [1.0 / 256, 0.2] + all_combinations = itertools.product( + in_dtype, out_dtype, zero_point, scale, out_zero_point, out_scale + ) + all_combinations = filter( + lambda parameters: not ( + parameters[0] == "int8" + and parameters[1] == "int8" + and parameters[4] == -128 + and parameters[5] == 1.0 / 256 + ), + all_combinations, + ) + return pytest.mark.parametrize( + ["in_dtype", "out_dtype", "zero_point", "scale", "out_zero_point", "out_scale"], + 
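# Every combination except the one configuration CMSIS-NN supports: int8 in/out with out_zero_point == -128 and out_scale == 1/256. +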
all_combinations, + )(test) + + +@parameterize_for_invalid_model +def test_invalid_softmax(in_dtype, out_dtype, zero_point, scale, out_zero_point, out_scale): + model = make_model( + [1, 16, 16, 3], in_dtype, out_dtype, zero_point, scale, out_zero_point, out_scale + ) -@pytest.mark.parametrize("in_dtype,out_dtype", [["uint8", "int8"], ["int8", "uint8"]]) -def test_softmax_not_int8(in_dtype, out_dtype): - model = make_model([1, 16, 16, 3], 64, 0.02, in_dtype, out_dtype) orig_mod = make_module(model) cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod) diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py index a332ab9b0824..7e2a8263a73d 100644 --- a/tests/python/contrib/test_ethosn/test_addition.py +++ b/tests/python/contrib/test_ethosn/test_addition.py @@ -19,7 +19,7 @@ import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei import numpy as np @@ -54,10 +54,8 @@ def _get_addition_qnn_params(input1_zp, input1_sc, input2_zp, input2_sc): return output_zp, output_sc +@requires_ethosn def test_addition(): - if not ethosn_available(): - return - trials = [ ((1, 22, 9, 9), 24, 1.057, 253, 0.452), ((1, 27, 21, 16), 79, 0.850, 24, 0.380), @@ -81,10 +79,8 @@ def test_addition(): tei.verify(outputs, 2) +@requires_ethosn def test_addition_failure(): - if not ethosn_available(): - return - trials = [ ( (2, 4, 4, 4), diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py index a529e046311a..332b4e264a96 100644 --- a/tests/python/contrib/test_ethosn/test_concatenate.py +++ b/tests/python/contrib/test_ethosn/test_concatenate.py @@ -20,7 +20,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -53,10 +53,8 @@ def _get_model(shapes, dtype, axis): return con +@requires_ethosn def test_concatenate(): - if not ethosn_available(): - return - trials = [ ([(1, 4), (1, 6)], 1), ([(1, 16, 4), (1, 16, 4)], 1), @@ -75,10 +73,8 @@ def test_concatenate(): tei.verify(outputs, 0) +@requires_ethosn def test_concatenate_failure(): - if not ethosn_available(): - return - trials = [ ([(1, 4, 4, 4, 4), (1, 4, 4, 4, 4)], "uint8", 1, "dimensions=5, dimensions must be <= 4;"), ( diff --git a/tests/python/contrib/test_ethosn/test_constant_duplication.py b/tests/python/contrib/test_ethosn/test_constant_duplication.py index e443a18c6d09..e0fb938340d7 100644 --- a/tests/python/contrib/test_ethosn/test_constant_duplication.py +++ b/tests/python/contrib/test_ethosn/test_constant_duplication.py @@ -20,7 +20,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . 
import infrastructure as tei @@ -70,10 +70,8 @@ def _get_model(): return req, params +@requires_ethosn def test_constant_duplication(): - if not ethosn_available(): - return - model, params = _get_model() mod = tei.make_module(model, params) res = tei.build(mod, params, npu=True, expected_host_ops=1) diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index 845cec593105..e4595af8260a 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -21,7 +21,7 @@ import math import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -113,10 +113,8 @@ def _get_model( return req, params +@requires_ethosn def test_conv2d(): - if not ethosn_available(): - return - trials = [ [(1, 17, 20, 26), 4, 3, 1, "attr", (2, 2), (1, 1)], [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1)], @@ -181,10 +179,8 @@ def test_conv2d(): tei.verify(outputs, 1) +@requires_ethosn def test_conv2d_failure(): - if not ethosn_available(): - return - _scale_error_msg = ( "Overall scale (of the input * weights / output) should be in the range [0, 1)" ) diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py index 7daf888f163f..02f90a360a24 100644 --- a/tests/python/contrib/test_ethosn/test_depth_to_space.py +++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py @@ -19,7 +19,7 @@ import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei import numpy as np @@ -30,10 +30,8 @@ def _get_model(shape, block, dtype, layout): return depth +@requires_ethosn def test_depth_to_space(): - if not ethosn_available(): - return - trials = [ (1, 16, 16, 16), (1, 64, 32, 16), @@ -52,10 +50,8 @@ def test_depth_to_space(): tei.verify(outputs, 1) +@requires_ethosn def test_depth_to_space_failure(): - if not ethosn_available(): - return - trials = [ ((2, 16, 16, 16), 2, "uint8", "NHWC", "batch size=2, batch size must = 1"), ((1, 16, 16, 16), 2, "int8", "NHWC", "dtype='int8', dtype must be either uint8 or int32"), diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py index 09d07f6a788e..5a693a53f705 100644 --- a/tests/python/contrib/test_ethosn/test_fullyconnected.py +++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py @@ -20,7 +20,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . 
import infrastructure as tei @@ -56,10 +56,8 @@ def _get_model( return req, params +@requires_ethosn def test_fullyconnected(): - if not ethosn_available(): - return - trials = [ ((1, 1024), 71, 0.580, 79, 1.498), ((1, 4096), 166, 1.724, 117, 0.180), @@ -91,10 +89,8 @@ def test_fullyconnected(): tei.verify(outputs, 1) +@requires_ethosn def test_fullyconnected_failure(): - if not ethosn_available(): - return - trials = [ ( (1, 64), diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index 65d2738447cc..f720c55c567a 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -23,8 +23,9 @@ pytest.importorskip("tensorflow") from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from tvm.contrib import download + import tvm.relay.testing.tf as tf_testing import tflite.Model from . import infrastructure as tei @@ -88,8 +89,6 @@ def _test_image_network( to check the correctness/accuracy. """ - if not ethosn_available(): - return def get_model(): if model_url[-3:] in ("tgz", "zip"): @@ -116,6 +115,7 @@ def get_model(): tei.run(m, inputs, output_count, npu=True) +@requires_ethosn def test_mobilenet_v1(): # If this test is failing due to a hash mismatch, please notify @mbaret and # @Leo-arm. The hash is there to catch any changes in the behaviour of the @@ -142,6 +142,7 @@ def test_mobilenet_v1(): ) +@requires_ethosn def test_inception_v3(): # If this test is failing due to a hash mismatch, please notify @mbaret and # @Leo-arm. The hash is there to catch any changes in the behaviour of the @@ -167,6 +168,7 @@ def test_inception_v3(): ) +@requires_ethosn def test_inception_v4(): # If this test is failing due to a hash mismatch, please notify @mbaret and # @Leo-arm. The hash is there to catch any changes in the behaviour of the @@ -192,6 +194,7 @@ def test_inception_v4(): ) +@requires_ethosn def test_ssd_mobilenet_v1(): # If this test is failing due to a hash mismatch, please notify @mbaret and # @Leo-arm. The hash is there to catch any changes in the behaviour of the diff --git a/tests/python/contrib/test_ethosn/test_pooling.py b/tests/python/contrib/test_ethosn/test_pooling.py index 6b2330f690c8..72b01dbc4ef5 100644 --- a/tests/python/contrib/test_ethosn/test_pooling.py +++ b/tests/python/contrib/test_ethosn/test_pooling.py @@ -20,7 +20,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . 
import infrastructure as tei @@ -35,10 +35,8 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype): return req +@requires_ethosn def test_pooling(): - if not ethosn_available(): - return - trials = [ ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0), "NHWC"), ((1, 9, 9, 9), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 1, 1), "NHWC"), @@ -60,10 +58,8 @@ def test_pooling(): tei.verify(outputs, 1) +@requires_ethosn def test_pooling_failure(): - if not ethosn_available(): - return - trials = [ ( (2, 8, 8, 8), diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py index 6b366e6122ff..1f3d00cba05e 100644 --- a/tests/python/contrib/test_ethosn/test_relu.py +++ b/tests/python/contrib/test_ethosn/test_relu.py @@ -19,7 +19,7 @@ import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei import numpy as np @@ -30,10 +30,8 @@ def _get_model(shape, dtype, a_min, a_max): return relu +@requires_ethosn def test_relu(): - if not ethosn_available(): - return - trials = [ ((1, 4, 4, 4), 65, 178), ((1, 8, 4, 2), 1, 254), @@ -53,10 +51,8 @@ def test_relu(): tei.verify(outputs, 1) +@requires_ethosn def test_relu_failure(): - if not ethosn_available(): - return - trials = [ ((1, 4, 4, 4, 4), "uint8", 65, 78, "dimensions=5, dimensions must be <= 4"), ((1, 8, 4, 2), "int8", 1, 254, "dtype='int8', dtype must be either uint8 or int32"), diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py index 20df5f9bd288..417b52f0eebd 100644 --- a/tests/python/contrib/test_ethosn/test_reshape.py +++ b/tests/python/contrib/test_ethosn/test_reshape.py @@ -19,7 +19,8 @@ import tvm from tvm import relay -from tvm.relay.op.contrib import ethosn_available, get_pattern_table +from tvm.testing import requires_ethosn +from tvm.relay.op.contrib import get_pattern_table from . import infrastructure as tei import numpy as np @@ -32,10 +33,8 @@ def _get_model(input_shape, output_shape, dtype): return req, params +@requires_ethosn def test_reshape(): - if not ethosn_available(): - return - trials = [ ((1, 15, 4, 1), (1, 60)), ((1, 15, 4, 1), (1, 30, 2)), @@ -58,10 +57,8 @@ def test_reshape(): tei.verify(outputs, 1) +@requires_ethosn def test_reshape_failure(): - if not ethosn_available(): - return - trials = [ ( (1, 15, 4, 1), diff --git a/tests/python/contrib/test_ethosn/test_sigmoid.py b/tests/python/contrib/test_ethosn/test_sigmoid.py index f3018dde1297..40a2fbd2d83f 100644 --- a/tests/python/contrib/test_ethosn/test_sigmoid.py +++ b/tests/python/contrib/test_ethosn/test_sigmoid.py @@ -19,7 +19,7 @@ import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . 
import infrastructure as tei import numpy as np @@ -41,10 +41,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype): return model +@requires_ethosn def test_sigmoid(): - if not ethosn_available(): - return - trials = [ (1, 16, 16, 16), (1, 8, 8), @@ -64,10 +62,8 @@ def test_sigmoid(): tei.verify(outputs, 1) +@requires_ethosn def test_sigmoid_failure(): - if not ethosn_available(): - return - trials = [ ((2, 4, 4, 4), 64, 0.2, 0, 1 / 256, "uint8", "batch size=2, batch size must = 1"), ( diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py index 3f1c57728c61..e81930a05ec5 100644 --- a/tests/python/contrib/test_ethosn/test_split.py +++ b/tests/python/contrib/test_ethosn/test_split.py @@ -20,7 +20,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available +from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -30,10 +30,8 @@ def _get_model(shape, dtype, splits, axis): return split.astuple() +@requires_ethosn def test_split(): - if not ethosn_available(): - return - trials = [ ((1, 16, 16, 32), (2, 7, 10), 2), ((1, 12, 8, 16), 3, 1), @@ -53,10 +51,8 @@ def test_split(): tei.verify(outputs, 0) +@requires_ethosn def test_split_failure(): - if not ethosn_available(): - return - trials = [ ((1, 4, 4, 4, 4), "uint8", 4, 2, "dimensions=5, dimensions must be <= 4;"), ((1, 4, 4, 4), "int8", 4, 2, "dtype='int8', dtype must be either uint8 or int32;"), diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py index eb2173ab202f..6d38c78a9f36 100644 --- a/tests/python/contrib/test_ethosn/test_topologies.py +++ b/tests/python/contrib/test_ethosn/test_topologies.py @@ -19,14 +19,13 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.op.contrib.ethosn import ethosn_available, Available +from tvm.testing import requires_ethosn +from tvm.relay.op.contrib.ethosn import Available from . import infrastructure as tei +@requires_ethosn def test_split_add_concat(): - if not ethosn_available(): - return - def get_model(input_shape, var_names): """Return a model""" @@ -71,6 +70,7 @@ def get_model(input_shape, var_names): tei.verify(outputs, 2) +@requires_ethosn def test_multiple_command_streams(): """Check that multiple Ethos-N partitions are correctly handled. @@ -80,8 +80,6 @@ def test_multiple_command_streams(): simple graph which creates two Ethos-N partitions and checks the result against an 'all-CPU' run through TVM. 
""" - if not ethosn_available(): - return def get_model(): """ @@ -107,10 +105,8 @@ def get_model(): ) +@requires_ethosn def test_output_order(): - if not ethosn_available(): - return - def get_model(input_shape, var_names): """Return a model""" @@ -140,10 +136,8 @@ def get_model(input_shape, var_names): tei.verify(outputs, 1) +@requires_ethosn def test_split_with_asym_concats(): - if not ethosn_available(): - return - def get_model(shape, splits, axis): a = relay.var("a", shape=shape, dtype="uint8") split = relay.op.split(a, indices_or_sections=splits, axis=axis) @@ -183,11 +177,10 @@ def get_model(shape, splits, axis): tei.verify(outputs, 0) +@requires_ethosn def test_output_tuple_propagation(): """This tests the case where the output tuple must be inferred as having dummy tensor information.""" - if not ethosn_available(): - return def get_model(): a = relay.var("a", shape=(1, 4, 4, 16), dtype="uint8") @@ -205,10 +198,8 @@ def get_model(): tei.verify(outputs, 0) +@requires_ethosn def test_input_tuples(): - if not ethosn_available(): - return - def get_model(shapes, axis): tup = [] for i, shape in enumerate(shapes): diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py new file mode 100644 index 000000000000..fc795c066cb6 --- /dev/null +++ b/tests/python/contrib/test_ethosu/infra.py @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module provides infrastructure to verify the correctness of +the command stream produced. + +Currently it will invoke vela to generate a vela-optimized tflite +in which the command stream is contained as a custom operator. +This class include methods to parse the custom operator to extract +the command stream and perform an equivalency check for single operator +test cases. 
+""" + +import numpy +from enum import IntEnum + +import tvm +from tvm import relay +import tvm.relay.backend.contrib.ethosu.op as ethosu_ops +from tvm.topi.nn.utils import get_pad_tuple + + +class AttachType(IntEnum): + kGroupRoot = 1 + kInline = 2 + kInlinedAlready = 3 + kScope = 4 + kScanUpdate = 5 + + +def generate_weights_data(shape, dtype): + size = 1 + for dim in shape: + size *= dim + return (numpy.arange(size) % 255).reshape(shape).astype(dtype) + + +def get_convolutional_args(call, include_buffers=False, remove_constants=False): + """A method to extract the arguments from conv2d or depthwise2d extern call.""" + args = call.args + conv_args = [] + remove_indices = [0] + + if remove_constants: + remove_indices += [41, 42, 44, 45] + + for i, arg in enumerate(args): + if i in remove_indices: + continue + elif isinstance(arg, tvm.tir.expr.IntImm) or isinstance(arg, tvm.tir.expr.FloatImm): + conv_args.append(arg.value) + elif isinstance(arg, tvm.tir.expr.Load) and not include_buffers: + conv_args.append(arg.index) + else: + conv_args.append(arg) + + return conv_args + + +def make_ethosu_conv2d( + ifm, + ifm_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + activation="NONE", + ifm_layout="NHWC", + ofm_layout="NHWC", + weight_dtype="int8", +): + # conv params + weight_shape = (ofm_channels, kernel_shape[0], kernel_shape[1], ifm_channels) + padding = get_pad_tuple(padding, kernel_shape) + + scale_bias_data = generate_weights_data((weight_shape[0], 10), "uint8") + scale_bias = relay.const(scale_bias_data, dtype="uint8") + weight_data = generate_weights_data(weight_shape, "int8") + weight = relay.const(weight_data, dtype=weight_dtype) + conv = ethosu_ops.ethosu_conv2d( + ifm, + weight, + scale_bias, + lut=relay.const([], dtype="int8"), + ifm_scale=0.5, + ifm_zero_point=10, + weight_zero_point=12, + ofm_scale=0.25, + ofm_zero_point=14, + kernel_shape=kernel_shape, + ofm_channels=ofm_channels, + strides=strides, + padding=padding, + dilation=dilation, + activation=activation, + clip_min=10 if activation == "CLIP" else 0, + clip_max=100 if activation == "CLIP" else 0, + upscale="NONE", + ifm_layout=ifm_layout, + ofm_layout=ofm_layout, + ) + return conv diff --git a/tests/python/contrib/test_ethosu/relay_ir_builder.py b/tests/python/contrib/test_ethosu/relay_ir_builder.py new file mode 100644 index 000000000000..6169a3e46520 --- /dev/null +++ b/tests/python/contrib/test_ethosu/relay_ir_builder.py @@ -0,0 +1,295 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Helper module to build relay operations for testing""" + +from pathlib import Path +import numpy as np +import math + +import tvm +from tvm import relay +from tvm.relay.op.contrib import get_pattern_table +from tvm.relay import qnn +from tvm.relay.backend.contrib.ethosu.util import get_range_for_dtype_str + + +class TensorType: + """A data structure to capture tensor parameters""" + + def __init__(self): + self.shape = None + self.dtype = None + self.zp = None + self.sc = None + self.layout = None + + def get_dim_size(self, dim): + for idx, char in enumerate(self.layout): + if dim == char: + return self.shape[idx] + return None + + def get_dim_index(self, dim): + for idx, char in enumerate(self.layout): + if dim == char: + return idx + return None + + +class QnnConv2DParams: + """A data structure to capture relay.qnn.op.conv2D parameters""" + + def __init__(self, dtype): + self.ifm = TensorType() + self.ofm = TensorType() + self.kernel = TensorType() + + # default values + self.ifm.dtype = dtype + self.ifm.layout = "NHWC" + ifm_min, ifm_max = get_range_for_dtype_str(self.ifm.dtype) + self.ifm.zp = relay.const(np.random.randint(ifm_min, ifm_max), "int32") + self.ifm.sc = relay.const(np.random.random() * 2, "float32") + self.kernel.dtype = dtype + self.kernel.layout = "HWIO" + kernel_min, kernel_max = get_range_for_dtype_str(self.kernel.dtype) + self.kernel.zp = relay.const(np.random.randint(kernel_min, kernel_max), "int32") + self.kernel.sc = relay.const(np.random.random() * 2, "float32") + self.ofm.layout = "NHWC" + self.ofm.dtype = dtype + ofm_min, ofm_max = get_range_for_dtype_str(self.ofm.dtype) + self.ofm.zp = relay.const(np.random.randint(ofm_min, ofm_max), "int32") + self.ofm.sc = relay.const(np.random.random() * 2, "float32") + self.dilation = (1, 1) + + self.strides = None + self.pad = None + self.activation = "NONE" + self.clip_min = 0 + self.clip_max = 0 + + def update_output_qnn_params( + self, input_dtype="uint8", kernel_dtype="uint8", output_dtype="uint8" + ): + _, dtype_max = get_range_for_dtype_str(input_dtype) + input_max = self.ifm.sc.data.asnumpy() * (dtype_max - self.ifm.zp.data.asnumpy()) + input_min = -self.ifm.sc.data.asnumpy() * self.ifm.zp.data.asnumpy() + _, dtype_max = get_range_for_dtype_str(kernel_dtype) + kernel_max = np.max( + self.kernel.sc.data.asnumpy() * (dtype_max - self.kernel.zp.data.asnumpy()) + ) + kernel_min = np.min(-self.kernel.sc.data.asnumpy() * self.kernel.zp.data.asnumpy()) + kernel_h = self.kernel.get_dim_size("H") + kernel_w = self.kernel.get_dim_size("W") + channels = self.kernel.get_dim_size("I") + output_limits = [ + kernel_max * kernel_h * kernel_w * channels * input_max, + kernel_min * kernel_h * kernel_w * channels * input_max, + kernel_min * kernel_h * kernel_w * channels * input_min, + kernel_max * kernel_h * kernel_w * channels * input_min, + ] + output_max = max(output_limits) + output_min = min(output_limits) + dtype_min, dtype_max = get_range_for_dtype_str(input_dtype) + self.ofm.sc = relay.const((output_max - output_min) / (dtype_max - dtype_min), "float32") + self.ofm.zp = relay.const(-int(output_min / self.ofm.sc.data.asnumpy()), "int32") + + +class PoolingParams: + """A data structure to capture relay.op.max_pool2d / + relay.op.avg_pool2d parameters + """ + + def __init__(self, dtype): + self.type = None + self.size = None + self.strides = None + self.pad = None + self.layout = None + self.ifm = TensorType() + self.ofm = TensorType() + + # default values + self.ifm.dtype = dtype + self.ifm.layout = "NHWC" + self.ifm.zp = 
relay.const(np.random.randint(0, 255), "int32") + self.ifm.sc = relay.const(np.random.random() * 2, "float32") + self.ofm.zp = relay.const(np.random.randint(0, 255), "int32") + self.ofm.sc = relay.const(np.random.random() * 2, "float32") + self.ofm.dtype = dtype + self.dilation = (1, 1) + + +class AddParams: + """A data structure to capture relay.qnn.op.add parameters""" + + def __init__(self, dtype): + self.ifm0 = TensorType() + self.ifm1 = TensorType() + self.ofm = TensorType() + + # default values + self.ifm0.dtype = dtype + self.ifm0.zp = relay.const(np.random.randint(0, 255), "int32") + self.ifm0.sc = relay.const(np.random.random() * 2, "float32") + self.ifm1.dtype = dtype + self.ifm1.zp = relay.const(np.random.randint(0, 255), "int32") + self.ifm1.sc = relay.const(np.random.random() * 2, "float32") + self.update_output_qnn_params() + self.ofm.dtype = dtype + + def update_output_qnn_params(self): + ti = np.iinfo(self.ifm0.dtype) + dtype_min, dtype_max = int(ti.min), int(ti.max) + input1_max = self.ifm0.sc.data.asnumpy() * (dtype_max - self.ifm0.zp.data.asnumpy()) + input1_min = (dtype_min - self.ifm0.sc.data.asnumpy()) * self.ifm0.zp.data.asnumpy() + input2_max = self.ifm1.sc.data.asnumpy() * (dtype_max - self.ifm1.zp.data.asnumpy()) + input2_min = (dtype_min - self.ifm1.sc.data.asnumpy()) * self.ifm1.zp.data.asnumpy() + output_max = input1_max + input2_max + output_min = input1_min + input2_min + self.ofm.sc = relay.const((output_max - output_min) / dtype_max, "float32") + self.ofm.zp = relay.const( + (dtype_min - int(output_min / self.ofm.sc.data.asnumpy())), "int32" + ) + + +def get_pad_value(data, kernel, stride): + """Get the pad tuple of value for SAME padding""" + + out = int(math.ceil(float(data) / float(stride))) + pad = max(0, (out - 1) * stride + kernel - data) + pad_before = pad // 2 + pad_after = pad - pad_before + return pad_before, pad_after + + +def create_qnn_conv2d(qnn_conv2d_params, ifm_expr): + """Create a relay.Expr of relay.qnn.conv2D given the parameters""" + v_params = list() + params = { + "kernel_size": [ + qnn_conv2d_params.kernel.get_dim_size("H"), + qnn_conv2d_params.kernel.get_dim_size("W"), + ], + "strides": [qnn_conv2d_params.strides[0], qnn_conv2d_params.strides[1]], + "dilation": [qnn_conv2d_params.dilation[0], qnn_conv2d_params.dilation[1]], + "padding": [0, 0, 0, 0], + "data_layout": qnn_conv2d_params.ifm.layout, + } + dilated_kernel_h = ( + qnn_conv2d_params.dilation[0] * (qnn_conv2d_params.kernel.get_dim_size("H") - 1) + 1 + ) + dilated_kernel_w = ( + qnn_conv2d_params.dilation[1] * (qnn_conv2d_params.kernel.get_dim_size("W") - 1) + 1 + ) + if qnn_conv2d_params.pad == "SAME": + pad_top, pad_bottom = get_pad_value( + qnn_conv2d_params.ifm.get_dim_size("H"), dilated_kernel_h, qnn_conv2d_params.strides[0] + ) + pad_left, pad_right = get_pad_value( + qnn_conv2d_params.ifm.get_dim_size("W"), dilated_kernel_w, qnn_conv2d_params.strides[1] + ) + do_pad = not (pad_top == 0 and pad_bottom == 0 and pad_left == 0 and pad_right == 0) + if do_pad: + params["padding"] = [pad_top, pad_left, pad_bottom, pad_right] + qnn_conv2d_params.pad = params["padding"] + params["input_zero_point"] = qnn_conv2d_params.ifm.zp + params["kernel_zero_point"] = qnn_conv2d_params.kernel.zp + params["out_dtype"] = "int32" + params["input_scale"] = qnn_conv2d_params.ifm.sc + params["kernel_scale"] = qnn_conv2d_params.kernel.sc + params["channels"] = int(qnn_conv2d_params.kernel.get_dim_size("O")) + params["kernel_layout"] = qnn_conv2d_params.kernel.layout + k_shape = 
qnn_conv2d_params.kernel.shape + k_dtype = qnn_conv2d_params.kernel.dtype + w = tvm.nd.array( + np.random.randint( + np.iinfo(k_dtype).min, high=np.iinfo(k_dtype).max, size=k_shape, dtype=k_dtype + ) + ) + weight_expr = relay.const(w, k_dtype) + v_params.append(w) + qnn_conv2d_expr = qnn.op.conv2d(ifm_expr, weight_expr, **params) + b = tvm.nd.array( + np.random.randint( + 0, high=10, size=(qnn_conv2d_params.kernel.get_dim_size("O")), dtype="int32" + ) + ) + v_params.append(b) + bias_expr = relay.const(b, "int32") + bias = relay.nn.bias_add( + qnn_conv2d_expr, bias_expr, axis=qnn_conv2d_params.ifm.get_dim_index("C") + ) + bias_scale = relay.const( + qnn_conv2d_params.ifm.sc.data.asnumpy() * qnn_conv2d_params.kernel.sc.data.asnumpy(), + "float32", + ) + req_expr = relay.qnn.op.requantize( + bias, + bias_scale, # input scale + relay.const(0, "int32"), # input zero point + qnn_conv2d_params.ofm.sc, # output scale + qnn_conv2d_params.ofm.zp, # output zero point + out_dtype=qnn_conv2d_params.ofm.dtype, + ) + if qnn_conv2d_params.activation != "NONE": + assert qnn_conv2d_params.activation == "CLIP" + clip_expr = relay.clip(req_expr, qnn_conv2d_params.clip_min, qnn_conv2d_params.clip_max) + return clip_expr, v_params + + return req_expr, v_params + + +def create_pool2d(pooling_params, ifm_expr): + """Create a relay pooling operation""" + assert pooling_params.ifm.layout == "NHWC" + params = { + "pool_size": (pooling_params.size[0], pooling_params.size[1]), + "strides": (pooling_params.strides[0], pooling_params.strides[1]), + "padding": [0, 0], + "layout": "NHWC", + } + if pooling_params.pad == "SAME": + pad_top, pad_bottom = get_pad_value( + pooling_params.ifm.shape[1], pooling_params.size[0], pooling_params.strides[0] + ) + pad_left, pad_right = get_pad_value( + pooling_params.ifm.shape[2], pooling_params.size[1], pooling_params.strides[1] + ) + params["padding"] = [pad_top, pad_left, pad_bottom, pad_right] + if pooling_params.type == "MAX": + out = relay.op.nn.max_pool2d(ifm_expr, **params) + else: + assert pooling_params.type == "AVG" + out = relay.op.cast(ifm_expr, dtype="int32") + out = relay.op.nn.avg_pool2d(out, **params) + out = relay.op.cast(out, dtype=pooling_params.ofm.dtype) + return out + + +def create_qnn_add(ifm0_expr, ifm1_expr, add_params): + add = relay.qnn.op.add( + lhs=ifm0_expr, + rhs=ifm1_expr, + lhs_scale=add_params.ifm0.sc, + lhs_zero_point=add_params.ifm0.zp, + rhs_scale=add_params.ifm1.sc, + rhs_zero_point=add_params.ifm1.zp, + output_scale=add_params.ofm.sc, + output_zero_point=add_params.ofm.zp, + ) + return add diff --git a/tests/python/contrib/test_ethosu/test_attr_passing.py b/tests/python/contrib/test_ethosu/test_attr_passing.py new file mode 100644 index 000000000000..a2fbe1888d2a --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_attr_passing.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +from tvm import relay +from tvm.relay.backend.contrib.ethosu import util + + +def test_compiler_attr(): + config = { + "accelerator_config": "ethos-u55-32", + } + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethosu.options": config}): + with tvm.target.Target("c -device=micro_dev"): + assert util.get_accelerator_config() == config["accelerator_config"] + + +def test_compiler_attr_default(): + default_config = { + "accelerator_config": "ethos-u55-256", + } + with tvm.transform.PassContext(opt_level=3): + with tvm.target.Target("c -device=micro_dev"): + assert util.get_accelerator_config() == default_config["accelerator_config"] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_compiler.py b/tests/python/contrib/test_ethosu/test_compiler.py new file mode 100644 index 000000000000..4df6311a230c --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_compiler.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +from tvm import relay +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir + + +def test_lower_to_tir(): + data = relay.var("data", shape=(1, 1, 1, 1024), dtype="uint8") + weight = relay.var("weight", shape=(1, 1, 1024, 1001), dtype="int8") + p2 = relay.var("p2", shape=(1, 1, 1, 1), dtype="int32") + conv = relay.nn.conv2d( + data, + weight, + kernel_size=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + multiply = relay.multiply(relay.const(-22, dtype="int32"), p2) + tile = relay.tile(multiply, reps=(1, 1, 1, 1001)) + subtract = relay.subtract(conv, tile) + func = subtract + expr = relay.Function(relay.analysis.free_vars(func), func) + mod = tvm.IRModule.from_expr(expr) + mod = relay.transform.InferType()(mod) + lower_to_tir(mod["main"]) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py new file mode 100644 index 000000000000..0e546ae2fd24 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -0,0 +1,273 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +from tvm import tir +from tvm import script +from tvm import relay +from tvm.script import ty +from tvm.relay.testing import run_opt_pass +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir +from tvm.relay.backend.contrib.ethosu.tir.scheduler import Convolution2DCompute + +from infra import make_ethosu_conv2d + + +# fmt: off +@tvm.script.tir +class WeightStreamOnly: + def main(placeholder: ty.handle, ethosu_write: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, placeholder_5: ty.handle, placeholder_6: ty.handle, placeholder_7: ty.handle, placeholder_8: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_7, [112], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_4, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_2, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_8, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_4 = tir.match_buffer(placeholder_5, [112], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_9 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_5 = tir.match_buffer(placeholder_3, [112], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_6 = tir.match_buffer(placeholder_1, [128], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_7 = tir.match_buffer(placeholder_6, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + placeholder_global = tir.allocate([128], "uint8", "global") + placeholder_d_global = tir.allocate([32], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_6.data, 0), 128, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_2.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 128, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_5.data, 0), 112, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 32, tir.load("uint8", placeholder_d_global, 
0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 2), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 112, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_4.data, 0), 112, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_7.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 4), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 112, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer.data, 0), 112, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_3.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 6), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 112, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +def test_weight_stream_only(): + def _planner(te_graph, const_dict, sch): + weights = te_graph.inputs[1] + bias = te_graph.inputs[2] + out = te_graph.outputs[0] + conv_compute = Convolution2DCompute.from_output(out) + co = conv_compute.split(sch, 3, 2) + cache_weights = sch.cache_read(weights, "global", [conv_compute.conv2d]) + cache_bias = sch.cache_read(bias, "global", [conv_compute.conv2d]) + sch[cache_weights].compute_at(sch[out], co) + sch[cache_bias].compute_at(sch[out], co) + + def _get_func(): + ifm = relay.var("ifm", shape=(1, 16, 16, 32), dtype="int8") + conv = make_ethosu_conv2d( + ifm, + 32, + 8, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + mod, consts = lower_to_tir(func, cascader=_planner) + script = tvm.script.asscript(mod, True) + test_mod = tvm.script.from_source(script) + reference_mod = WeightStreamOnly() + tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) + + reference_const_sizes = {2: 128, 3: 32, 4: 112, 5: 32, 6: 112, 7: 32, 8: 112, 9: 32} + test_const_sizes = {} + for key, value in consts.items(): + test_const_sizes[key] = len(value) + + assert reference_const_sizes == test_const_sizes + + +# fmt: off +@tvm.script.tir +class DirectReadOnly: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: 
ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_3, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([4096], "int8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 592, 12, tir.load("uint8", buffer_2.data, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 160, 12, tir.load("uint8", buffer_3.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +def test_direct_read_only(): + def _get_func(): + ifm = relay.var("ifm", shape=(1, 16, 16, 32), dtype="int8") + conv1 = make_ethosu_conv2d( + ifm, + 32, + 16, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + conv2 = make_ethosu_conv2d( + conv1, + 16, + 8, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + func = relay.Function(relay.analysis.free_vars(conv2), conv2) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + mod, consts = lower_to_tir(func) + + script = tvm.script.asscript(mod, True) + test_mod = tvm.script.from_source(script) + reference_mod = DirectReadOnly() + tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) + + reference_const_sizes = {1: 592, 2: 160, 3: 160, 4: 80} + test_const_sizes = {} + for key, value in consts.items(): + test_const_sizes[key] = len(value) + + assert reference_const_sizes == test_const_sizes + + +# fmt: off +@tvm.script.tir +class MixedRead: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, placeholder_5: ty.handle, placeholder_6: ty.handle, placeholder_7: ty.handle, placeholder_8: ty.handle, placeholder_9: ty.handle, placeholder_10: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_7, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_5, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_3, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + 
buffer_3 = tir.match_buffer(placeholder_4, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_4 = tir.match_buffer(placeholder_9, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_5 = tir.match_buffer(placeholder_6, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_11 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_6 = tir.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_7 = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_8 = tir.match_buffer(placeholder_8, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_9 = tir.match_buffer(placeholder_10, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([4096], "int8", "global") + placeholder_global = tir.allocate([80], "uint8", "global") + placeholder_d_global = tir.allocate([32], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_11.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_6.data, 0), 592, 12, tir.load("uint8", buffer_7.data, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_2.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_3.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_5.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 2), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_8.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", 
ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 4), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_4.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_9.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 6), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +def test_mixed_read(): + def _planner(te_graph, const_dict, sch): + weight = te_graph.inputs[4] + scale_bias = te_graph.inputs[5] + out = te_graph.outputs[0] + conv_compute = Convolution2DCompute.from_output(out) + co = conv_compute.split(sch, 3, 2) + cache_weight = sch.cache_read(weight, "global", [conv_compute.conv2d]) + cache_scale_bias = sch.cache_read(scale_bias, "global", [conv_compute.conv2d]) + sch[cache_weight].compute_at(sch[out], co) + sch[cache_scale_bias].compute_at(sch[out], co) + + def _get_func(): + ifm = relay.var("ifm", shape=(1, 16, 16, 32), dtype="int8") + conv1 = make_ethosu_conv2d( + ifm, + 32, + 16, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + conv2 = make_ethosu_conv2d( + conv1, + 16, + 8, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + func = relay.Function(relay.analysis.free_vars(conv2), conv2) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + mod, consts = lower_to_tir(func, cascader=_planner) + + script = tvm.script.asscript(mod, True) + test_mod = tvm.script.from_source(script) + reference_mod = MixedRead() + tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) + + reference_const_sizes = { + 1: 592, + 2: 160, + 4: 80, + 5: 32, + 6: 80, + 7: 32, + 8: 80, + 9: 32, + 10: 80, + 11: 32, + } + test_const_sizes = {} + for key, value in consts.items(): + test_const_sizes[key] = len(value) + + assert reference_const_sizes == test_const_sizes + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_extract_constants.py b/tests/python/contrib/test_ethosu/test_extract_constants.py new file mode 100644 index 000000000000..98094d8a4ed4 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_extract_constants.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +from tvm import relay +from tvm.relay.testing import run_opt_pass +from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants + +import numpy as np + + +def test_extract_constants_single(): + def _get_func(): + var_input = relay.var("data", shape=(10, 10), dtype="uint8") + const_data = np.random.uniform(0, 255, (10, 10)).astype("uint8") + const_input = relay.const(const_data, dtype="uint8") + out = relay.add(var_input, const_input) + func = relay.Function(relay.analysis.free_vars(out), out) + func = run_opt_pass(func, relay.transform.InferType()) + return func, const_input + + def _expected(): + var_input1 = relay.var("data", shape=(10, 10), dtype="uint8") + var_input2 = relay.var("p1", shape=(10, 10), dtype="uint8") + out = relay.add(var_input1, var_input2) + func = relay.Function(relay.analysis.free_vars(out), out) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func, const = _get_func() + new_func, const_dict = extract_constants(func) + assert tvm.ir.structural_equal(new_func, _expected()) + assert 1 in const_dict + assert (const_dict[1] == const.data.asnumpy()).all() + + +def test_extract_constants_multi(): + def _get_func(): + var_input1 = relay.var("data1", shape=(10, 10), dtype="uint8") + var_input2 = relay.var("data2", shape=(10, 10), dtype="uint8") + const_data_1 = np.random.uniform(0, 255, (10, 10)).astype("uint8") + const_data_2 = np.random.uniform(0, 255, (10, 10)).astype("uint8") + const_data_3 = np.random.uniform(0, 255, (10, 10)).astype("uint8") + const_data_4 = np.random.uniform(0, 255, (10, 10)).astype("uint8") + const_input_1 = relay.const(const_data_1, dtype="uint8") + const_input_2 = relay.const(const_data_2, dtype="uint8") + const_input_3 = relay.const(const_data_3, dtype="uint8") + const_input_4 = relay.const(const_data_4, dtype="uint8") + out = relay.add(var_input1, var_input2) + out = relay.add(out, const_input_1) + out = relay.add(out, const_input_2) + out = relay.add(out, const_input_3) + out = relay.add(out, const_input_4) + func = relay.Function(relay.analysis.free_vars(out), out) + func = run_opt_pass(func, relay.transform.InferType()) + return func, [const_input_1, const_input_2, const_input_3, const_input_4] + + def _expected(): + var_input1 = relay.var("data1", shape=(10, 10), dtype="uint8") + var_input2 = relay.var("data2", shape=(10, 10), dtype="uint8") + var_input3 = relay.var("p1", shape=(10, 10), dtype="uint8") + var_input4 = relay.var("p2", shape=(10, 10), dtype="uint8") + var_input5 = relay.var("p3", shape=(10, 10), dtype="uint8") + var_input6 = relay.var("p4", shape=(10, 10), dtype="uint8") + out = relay.add(var_input1, var_input2) + out = relay.add(out, var_input3) + out = relay.add(out, var_input4) + out = relay.add(out, var_input5) + out = relay.add(out, var_input6) + func = relay.Function(relay.analysis.free_vars(out), out) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func, consts = _get_func() + new_func, const_dict = extract_constants(func) + assert tvm.ir.structural_equal(new_func, _expected()) + 
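+    # The two relay.var inputs keep parameter slots 0 and 1, so each extracted
+    # constant is expected to land at slot i + 2 of the rewritten function.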
for i, const in enumerate(consts): + assert i + 2 in const_dict + assert (const_dict[i + 2] == consts[i].data.asnumpy()).all() + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py new file mode 100644 index 000000000000..52f6995c3aaa --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -0,0 +1,337 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument + +import pytest + +pytest.importorskip("ethosu.vela") +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.backend.contrib import ethosu +from tvm.relay.backend.contrib.ethosu import legalize, preprocess +from tvm.relay.dataflow_pattern import * +from tvm.relay.op.contrib.ethosu import * +import relay_ir_builder + + +def test_split_indices_legalize(): + def create_graph(axis): + x = relay.var("x", shape=(1, 50, 50, 3)) + x_relu = relay.nn.relu(x) + split_output = relay.split(x_relu, [5, 20, 45], axis).tuple_value + return relay.Function([x], split_output) + + def expected_mod_axis1(): + expected_ir_string = """ + #[version = "0.0.5"] + def @tvmgen_default_ethosu_main_0(%x: Tensor[(1, 50, 50, 3), float32]) -> (Tensor[(1, 5, 50, 3), float32],\ + Tensor[(1, 15, 50, 3), float32],\ + Tensor[(1, 25, 50, 3), float32],\ + Tensor[(1, 5, 50, 3), float32]) { + %0 = nn.relu(%x) /* ty=Tensor[(1, 50, 50, 3), float32] */; + %1 = strided_slice(%0, begin=[0, 0, 0, 0], end=[1, 5, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 5, 50, 3), float32] */; + %2 = strided_slice(%0, begin=[0, 5, 0, 0], end=[1, 20, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 15, 50, 3), float32] */; + %3 = strided_slice(%0, begin=[0, 20, 0, 0], end=[1, 45, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 25, 50, 3), float32] */; + %4 = strided_slice(%0, begin=[0, 45, 0, 0], end=[1, 50, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 5, 50, 3), float32] */; + (%1, %2, %3, %4) + } + """ + return tvm.parser.fromtext(expected_ir_string) + + def expected_mod_axis2(): + expected_ir_string = """ + #[version = "0.0.5"] + def @tvmgen_default_ethosu_main_0(%x: Tensor[(1, 50, 50, 3), float32]) -> (Tensor[(1, 50, 5, 3), float32],\ + Tensor[(1, 50, 15, 3), float32],\ + Tensor[(1, 50, 25, 3), float32],\ + Tensor[(1, 50, 5, 3), float32]) { + %0 = nn.relu(%x) /* ty=Tensor[(1, 50, 50, 3), float32] */; + %1 = strided_slice(%0, begin=[0, 0, 0, 0], end=[1, 50, 5, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 5, 3), float32] */; + %2 = strided_slice(%0, begin=[0, 0, 5, 0], end=[1, 50, 20, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 15, 3), float32] */; + %3 = strided_slice(%0, begin=[0, 0, 20, 0], end=[1, 50, 45, 3], strides=[1], 
axes=None)\ + /* ty=Tensor[(1, 50, 25, 3), float32] */; + %4 = strided_slice(%0, begin=[0, 0, 45, 0], end=[1, 50, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 5, 3), float32] */; + (%1, %2, %3, %4) + } + """ + return tvm.parser.fromtext(expected_ir_string) + + mod_axis1 = tvm.IRModule() + mod_axis1["tvmgen_default_ethosu_main_0"] = create_graph(1) + mod_axis1 = legalize.LegalizeSplit()(mod_axis1) + expected_axis1 = expected_mod_axis1() + tvm.ir.assert_structural_equal(mod_axis1, expected_axis1) + + mod_axis2 = tvm.IRModule() + mod_axis2["tvmgen_default_ethosu_main_0"] = create_graph(2) + mod_axis2 = legalize.LegalizeSplit()(mod_axis2) + expected_axis2 = expected_mod_axis2() + tvm.ir.assert_structural_equal(mod_axis2, expected_axis2) + + +def test_split_sections_legalize(): + def create_graph(axis, sections): + x = relay.var("x", shape=(1, 50, 50, 3)) + x_abs = relay.abs(x) + split_output = relay.split(x_abs, sections, axis).tuple_value + outputs = list() + for section_idx in range(sections): + split_single_out = relay.TupleGetItem(split_output, section_idx) + tanh = relay.tanh(split_single_out) + outputs.append(tanh) + tuple_out = relay.Tuple(outputs) + return relay.Function([x], tuple_out) + + def expected_mod_axis1(): + expected_ir_string = """ + #[version = "0.0.5"] + def @tvmgen_default_ethosu_main_0(%x: Tensor[(1, 50, 50, 3), float32]) -> (Tensor[(1, 10, 50, 3), float32],\ + Tensor[(1, 10, 50, 3), float32],\ + Tensor[(1, 10, 50, 3), float32],\ + Tensor[(1, 10, 50, 3), float32],\ + Tensor[(1, 10, 50, 3), float32]) { + %0 = abs(%x) /* ty=Tensor[(1, 50, 50, 3), float32] */; + %1 = strided_slice(%0, begin=[0, 0, 0, 0], end=[1, 10, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 10, 50, 3), float32] */; + %2 = strided_slice(%0, begin=[0, 10, 0, 0], end=[1, 20, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 10, 50, 3), float32] */; + %3 = strided_slice(%0, begin=[0, 20, 0, 0], end=[1, 30, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 10, 50, 3), float32] */; + %4 = strided_slice(%0, begin=[0, 30, 0, 0], end=[1, 40, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 10, 50, 3), float32] */; + %5 = strided_slice(%0, begin=[0, 40, 0, 0], end=[1, 50, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 10, 50, 3), float32] */; + %6 = (%1, %2, %3, %4, %5); + %7 = %6.0; + %8 = tanh(%7) /* ty=Tensor[(1, 10, 50, 3), float32] */; + %9 = %6.1; + %10 = tanh(%9) /* ty=Tensor[(1, 10, 50, 3), float32] */; + %11 = %6.2; + %12 = tanh(%11) /* ty=Tensor[(1, 10, 50, 3), float32] */; + %13 = %6.3; + %14 = tanh(%13) /* ty=Tensor[(1, 10, 50, 3), float32] */; + %15 = %6.4; + %16 = tanh(%15) /* ty=Tensor[(1, 10, 50, 3), float32] */; + (%8, %10, %12, %14, %16) + } + """ + return tvm.parser.fromtext(expected_ir_string) + + def expected_mod_axis2(): + expected_ir_string = """ + #[version = "0.0.5"] + def @tvmgen_default_ethosu_main_0(%x: Tensor[(1, 50, 50, 3), float32]) -> (Tensor[(1, 50, 10, 3), float32],\ + Tensor[(1, 50, 10, 3), float32],\ + Tensor[(1, 50, 10, 3), float32],\ + Tensor[(1, 50, 10, 3), float32],\ + Tensor[(1, 50, 10, 3), float32]) { + %0 = abs(%x) /* ty=Tensor[(1, 50, 50, 3), float32] */; + %1 = strided_slice(%0, begin=[0, 0, 0, 0], end=[1, 50, 10, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 10, 3), float32] */; + %2 = strided_slice(%0, begin=[0, 0, 10, 0], end=[1, 50, 20, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 10, 3), float32] */; + %3 = strided_slice(%0, begin=[0, 0, 20, 0], end=[1, 50, 30, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 10, 3), 
float32] */; + %4 = strided_slice(%0, begin=[0, 0, 30, 0], end=[1, 50, 40, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 10, 3), float32] */; + %5 = strided_slice(%0, begin=[0, 0, 40, 0], end=[1, 50, 50, 3], strides=[1], axes=None)\ + /* ty=Tensor[(1, 50, 10, 3), float32] */; + %6 = (%1, %2, %3, %4, %5); + %7 = %6.0; + %8 = tanh(%7) /* ty=Tensor[(1, 50, 10, 3), float32] */; + %9 = %6.1; + %10 = tanh(%9) /* ty=Tensor[(1, 50, 10, 3), float32] */; + %11 = %6.2; + %12 = tanh(%11) /* ty=Tensor[(1, 50, 10, 3), float32] */; + %13 = %6.3; + %14 = tanh(%13) /* ty=Tensor[(1, 50, 10, 3), float32] */; + %15 = %6.4; + %16 = tanh(%15) /* ty=Tensor[(1, 50, 10, 3), float32] */; + (%8, %10, %12, %14, %16) + } + """ + return tvm.parser.fromtext(expected_ir_string) + + mod_axis1 = tvm.IRModule() + mod_axis1["tvmgen_default_ethosu_main_0"] = create_graph(1, 5) + mod_axis1 = legalize.LegalizeSplit()(mod_axis1) + expected_axis1 = expected_mod_axis1() + tvm.ir.assert_structural_equal(mod_axis1, expected_axis1) + + mod_axis2 = tvm.IRModule() + mod_axis2["tvmgen_default_ethosu_main_0"] = create_graph(2, 5) + mod_axis2 = legalize.LegalizeSplit()(mod_axis2) + expected_axis2 = expected_mod_axis2() + tvm.ir.assert_structural_equal(mod_axis2, expected_axis2) + + +def infer_type_function_pass(func): + mod = tvm.IRModule() + mod["test"] = func + mod = relay.transform.InferType()(mod) + return mod["test"] + + +def get_shape_expr(in_expr, out_expr): + main_f = relay.Function([in_expr], out_expr) + main_f = infer_type_function_pass(main_f) + shape = [int(i) for i in main_f.body.checked_type.shape] + return shape + + +INVERSE_LAYOUT_TRANSFORM_OHWI_MAP = { + "HWIO": [1, 2, 3, 0], + "HWOI": [1, 2, 0, 3], + "OWHI": [0, 1, 2, 3], +} + + +def test_ethosu_conv2d_legalize(): + def create_graph_single(input_tensor_name, input_tensor_shape, input_tensor_dtype): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.kernel.shape = (3, 3, c1_params.ifm.shape[3], 32) + c1_params.strides = (1, 1) + c1_params.pad = "VALID" + c1_params.activation = "CLIP" + c1_params.clip_min = 23 + c1_params.clip_max = 180 + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + f = relay.Function([input0], c1) + mod = tvm.IRModule() + mod["main"] = f + return mod, [c1_params] + + def create_graph_double(input_tensor_name, input_tensor_shape, input_tensor_dtype): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.kernel.shape = (7, 7, c1_params.ifm.shape[3], 8) + c1_params.strides = (2, 2) + c1_params.pad = "VALID" + c1_params.activation = "CLIP" + c1_params.clip_min = 10 + c1_params.clip_max = 240 + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + c2_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c2_params.ifm.shape = c1_params.ofm.shape + c2_params.kernel.shape = (5, 5, c2_params.ifm.shape[3], 16) + c2_params.strides = (1, 1) + c2_params.pad = "SAME" + c2, new_params = relay_ir_builder.create_qnn_conv2d(c2_params, c1) + c2_params.ofm.shape = get_shape_expr(input0, c2) + + f = relay.Function([input0], c2) + mod = tvm.IRModule() + mod["main"] = f + return mod, 
[c2_params, c1_params] + + def verify_tensor(tensor_type, expr): + assert list(tensor_type.shape) == list(expr.checked_type.shape) + assert str(tensor_type.dtype) == str(expr.checked_type.dtype) + + def verify_linear(ext_func, conv2d_params): + op = ext_func.body + for param in conv2d_params: + verify_tensor(param.ifm, op.args[0]) + verify_tensor(param.ofm, op) + + # This will be in OHWI layout + weights_ohwi = op.args[1].data.asnumpy() + weights_layout = str(param.kernel.layout) + weights = np.transpose(weights_ohwi, INVERSE_LAYOUT_TRANSFORM_OHWI_MAP[weights_layout]) + assert weights.shape == param.kernel.shape + assert weights.dtype == param.kernel.dtype + + assert list(op.args[2].checked_type.shape)[0] == weights_ohwi.shape[0] + + assert float(op.attrs.ifm_scale) == float(param.ifm.sc.data.asnumpy()) + assert int(op.attrs.ifm_zero_point) == int(param.ifm.zp.data.asnumpy()) + assert int(op.attrs.weight_zero_point) == int(param.kernel.zp.data.asnumpy()) + assert float(op.attrs.ofm_scale) == float(param.ofm.sc.data.asnumpy()) + assert int(op.attrs.ofm_zero_point) == int(param.ofm.zp.data.asnumpy()) + assert int(op.attrs.ofm_channels) == int(weights_ohwi.shape[0]) + assert list(op.attrs.padding) == list(param.pad) + assert list(op.attrs.strides) == list(param.strides) + assert list(op.attrs.dilation) == list(param.dilation) + assert str(op.attrs.activation) == str(param.activation) + assert int(op.attrs.clip_min) == int(param.clip_min) + assert int(op.attrs.clip_max) == int(param.clip_max) + op = op.args[0] + + test_cases = [ + (create_graph_single, ["input", (1, 299, 299, 3), "uint8"]), + (create_graph_double, ["input", (1, 128, 256, 4), "uint8"]), + ] + for test_case in test_cases: + mod, conv_params = test_case[0](*test_case[1]) + mod = ethosu.partition_for_ethosu(mod) + mod = legalize.LegalizeEthosUConv2D()(mod) + verify_linear(mod["tvmgen_default_ethosu_main_0"], conv_params) + + +def test_ethosu_conv2d_legalize_errors(): + def create_graph_single_unsupported_ifm_layout( + input_tensor_name, input_tensor_shape, input_tensor_dtype + ): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.ifm.layout = "NCHW" + c1_params.kernel.shape = (3, 3, c1_params.ifm.shape[1], 32) + c1_params.strides = (1, 1) + c1_params.pad = "VALID" + c1_params.activation = "CLIP" + c1_params.clip_min = 23 + c1_params.clip_max = 180 + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + f = relay.Function([input0], c1) + mod = tvm.IRModule() + mod["main"] = f + return mod, [c1_params] + + test_cases = [ + (create_graph_single_unsupported_ifm_layout, ["input", (1, 3, 299, 299), "uint8"]), + ] + + for test_case in test_cases: + mod, conv_params = test_case[0](*test_case[1]) + mod = ethosu.partition_for_ethosu(mod) + with pytest.raises( + tvm._ffi.base.TVMError, match="EthosUCodegenError: Unsupported Layout NCHW" + ): + mod = legalize.LegalizeEthosUConv2D()(mod) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_lower_to_te.py b/tests/python/contrib/test_ethosu/test_lower_to_te.py new file mode 100644 index 000000000000..cabd68b4e8d2 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_lower_to_te.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +from tvm import relay +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_te +from tvm.relay.backend.contrib.ethosu.tir.scheduler import Convolution2DCompute +import tvm.relay.backend.contrib.ethosu.op as ethosu_ops + + +def test_ethosu_conv2d(): + ifm = relay.var("ifm", shape=(1, 10, 20, 30), dtype="uint8") + weight = relay.var("weight", shape=(40, 3, 3, 30), dtype="uint8") + scale_bias = relay.var("scale_bias", shape=(40, 10), dtype="uint8") + lut = relay.var("lut", shape=(), dtype="uint8") + conv = ethosu_ops.ethosu_conv2d( + ifm, + weight, + scale_bias, + lut, + ifm_scale=0.5, + ifm_zero_point=10, + weight_zero_point=12, + ofm_scale=0.25, + ofm_zero_point=14, + ofm_channels=40, + padding=(1, 1, 1, 1), + kernel_shape=(3, 3), + strides=(1, 1), + dilation=(1, 1), + ) + expr = relay.Function(relay.analysis.free_vars(conv), conv) + mod = tvm.IRModule.from_expr(expr) + mod = relay.transform.InferType()(mod) + lowered = lower_to_te(mod["main"]) + assert len(lowered.outputs) == 1 + assert len(lowered.inputs) == 4 + conv2d_compute = Convolution2DCompute.from_output(lowered.outputs[0]) + assert conv2d_compute.conv2d.name == "ethosu_conv2d" + input_shapes = set() + for inp in lowered.inputs: + input_shapes.add(tuple([x.value for x in inp.shape])) + assert input_shapes == {(40, 10), (1, 10, 20, 30), (40, 3, 3, 30), ()} + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_preprocess.py b/tests/python/contrib/test_ethosu/test_preprocess.py new file mode 100644 index 000000000000..f2c7b0afafd8 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_preprocess.py @@ -0,0 +1,343 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
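+#
+# The tests below exercise the preprocess_ext_io pass: external
+# (Compiler="ethosu") functions with multiple inputs or outputs are rewritten
+# to consume a single concatenated 1-D input and/or produce a single
+# concatenated 1-D output, e.g. two (10, 10) inputs become one (200,) vector
+# that the function splits and reshapes back internally, while functions with
+# a single input and a single output are left untouched.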
+# pylint: disable=invalid-name, unused-argument
+
+import pytest
+
+pytest.importorskip("ethosu.vela")
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend.contrib.ethosu import preprocess
+
+
+def set_func_attr(func, compile_name, symbol_name):
+    """
+    Helper function to attach attributes to the external function.
+    """
+    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
+    func = func.with_attr("Compiler", compile_name)
+    func = func.with_attr("global_symbol", symbol_name)
+    return func
+
+
+def test_single_io():
+    """
+    Test that the pass won't touch external functions that
+    have a single input and a single output.
+    """
+
+    def create_graph():
+        def create_external_func1(mod_, compiler_name, symbol_name):
+            x_int = relay.var("x_int", shape=(10, 10))
+            z0 = relay.nn.relu(x_int)
+            f1 = relay.Function([x_int], z0)
+            f1 = set_func_attr(f1, compiler_name, symbol_name)
+            glb_f1 = relay.GlobalVar(symbol_name)
+            mod_[glb_f1] = f1
+            mod_ = relay.transform.InferType()(mod_)
+            return glb_f1, mod_
+
+        mod = tvm.IRModule()
+        x = relay.var("x", shape=(10, 10))
+
+        glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0")
+        r = relay.Call(glb_symbol_f1, [x])
+        main = relay.Function([x], r)
+        mod["main"] = main
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    mod = create_graph()
+    exp = create_graph()
+    mod = preprocess.preprocess_ext_io()(mod)
+    assert tvm.ir.structural_equal(mod, exp, map_free_vars=True)
+
+
+def test_2ins_single_out():
+    """
+    Test an external function with two inputs and a single output.
+    """
+
+    def create_graph():
+        def create_external_func1(mod_, compiler_name, symbol_name):
+            x_int = relay.var("x_int", shape=(10, 10))
+            w0_int = relay.var("w0_int", shape=(10, 10))
+            z0 = relay.add(x_int, w0_int)
+
+            f1 = relay.Function([x_int, w0_int], z0)
+            f1 = set_func_attr(f1, compiler_name, symbol_name)
+            glb_f1 = relay.GlobalVar(symbol_name)
+            mod_[glb_f1] = f1
+            mod_ = relay.transform.InferType()(mod_)
+            return glb_f1, mod_
+
+        mod = tvm.IRModule()
+
+        x = relay.var("x", shape=(10, 10))
+        w0 = relay.var("w0", shape=(10, 10))
+
+        glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0")
+        r = relay.Call(glb_symbol_f1, [x, w0])
+        main = relay.Function([x, w0], r)
+        mod["main"] = main
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    def expected():
+        def create_external_func1(mod_, compiler_name, symbol_name):
+            ifms_int = relay.var("ifms_int", shape=[200])
+
+            # splits
+            (x_int_flat, w0_int_flat) = relay.split(ifms_int, [100])
+            # reshapes
+            x_int = relay.reshape(x_int_flat, newshape=(10, 10))
+            w0_int = relay.reshape(w0_int_flat, newshape=(10, 10))
+
+            z0 = relay.add(x_int, w0_int)
+            f1 = relay.Function([ifms_int], z0)
+            f1 = set_func_attr(f1, compiler_name, symbol_name)
+            glb_f1 = relay.GlobalVar(symbol_name)
+            mod_[glb_f1] = f1
+            mod_ = relay.transform.InferType()(mod_)
+            return glb_f1, mod_
+
+        mod = tvm.IRModule()
+
+        x = relay.var("x", shape=(10, 10))
+        w0 = relay.var("w0", shape=(10, 10))
+
+        # reshapes
+        x_reshaped = relay.reshape(x, newshape=100)
+        w0_reshaped = relay.reshape(w0, newshape=100)
+
+        # concat
+        ifms = relay.concatenate((x_reshaped, w0_reshaped), 0)
+
+        glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0")
+        r = relay.Call(glb_symbol_f1, [ifms])
+        main = relay.Function([x, w0], r)
+        mod["main"] = main
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    mod = create_graph()
+    exp = expected()
+    mod = preprocess.preprocess_ext_io()(mod)
+    assert tvm.ir.structural_equal(mod, exp, map_free_vars=True)
+
+
+def test_single_in_2outs():
+    """
+    Test an external function with a single input and two outputs.
+    """
+
+    def create_graph():
+        def create_external_func1(mod_, compiler_name, symbol_name):
+            x_int = relay.var("x_int", shape=(10, 10))
+
+            p0 = relay.nn.relu(x_int)
+            q0 = relay.tanh(x_int)
+            f1_o_tuple = relay.Tuple([p0, q0])
+
+            f1 = relay.Function([x_int], f1_o_tuple)
+            f1 = set_func_attr(f1, compiler_name, symbol_name)
+            glb_f1 = relay.GlobalVar(symbol_name)
+            mod_[glb_f1] = f1
+            mod_ = relay.transform.InferType()(mod_)
+            return glb_f1, mod_
+
+        mod = tvm.IRModule()
+        x = relay.var("x", shape=(10, 10))
+        glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0")
+        pq_tuple = relay.Call(glb_symbol_f1, [x])
+        p0 = relay.TupleGetItem(pq_tuple, 0)
+        q0 = relay.TupleGetItem(pq_tuple, 1)
+        r = relay.concatenate((p0, q0), axis=0)
+        main = relay.Function([x], r)
+        mod["main"] = main
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    def expected():
+        def create_external_func1(mod_, compiler_name, symbol_name):
+            x_int = relay.var("x_int", shape=(10, 10))
+
+            p0 = relay.nn.relu(x_int)
+            q0 = relay.tanh(x_int)
+
+            # reshapes
+            p0_reshaped = relay.reshape(p0, newshape=100)
+            q0_reshaped = relay.reshape(q0, newshape=100)
+            ofms = relay.concatenate((p0_reshaped, q0_reshaped), 0)
+
+            f1 = relay.Function([x_int], ofms)
+            f1 = set_func_attr(f1, compiler_name, symbol_name)
+            glb_f1 = relay.GlobalVar(symbol_name)
+            mod_[glb_f1] = f1
+            mod_ = relay.transform.InferType()(mod_)
+            return glb_f1, mod_
+
+        mod = tvm.IRModule()
+        x = relay.var("x", shape=(10, 10))
+        glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0")
+        ofms = relay.Call(glb_symbol_f1, [x])
+
+        # splits
+        (p0_flat, q0_flat) = relay.split(ofms, [100])
+        # reshapes
+        p0_flat_reshaped = relay.reshape(p0_flat, newshape=(10, 10))
+        q0_flat_reshaped = relay.reshape(q0_flat, newshape=(10, 10))
+        # original output
+        tuple_out = relay.Tuple([p0_flat_reshaped, q0_flat_reshaped])
+
+        p0 = relay.TupleGetItem(tuple_out, 0)
+        q0 = relay.TupleGetItem(tuple_out, 1)
+        r = relay.concatenate((p0, q0), axis=0)
+        main = relay.Function([x], r)
+        mod["main"] = main
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    mod = create_graph()
+    exp = expected()
+    mod = relay.transform.InferType()(mod)
+    mod = preprocess.preprocess_ext_io()(mod)
+    assert tvm.ir.structural_equal(mod, exp, map_free_vars=True)
+
+
+def test_4ins_2outs():
+    """
+    Test an external function with four inputs and two outputs.
+    This stands as a general test for multiple ins/outs.
+ """ + + def create_graph(): + def create_external_func1(mod_, compiler_name, symbol_name): + x_int = relay.var("x_int", shape=(10, 10)) + w0_int = relay.var("w0_int", shape=(10, 10)) + w1_int = relay.var("w1_int", shape=(10, 10)) + w2_int = relay.var("w2_int", shape=(10, 10)) + + z0 = relay.add(x_int, w0_int) + p0 = relay.subtract(z0, w1_int) + q0 = relay.multiply(z0, w2_int) + f1_o_tuple = relay.Tuple([p0, q0]) + + f1 = relay.Function([x_int, w0_int, w1_int, w2_int], f1_o_tuple) + f1 = set_func_attr(f1, compiler_name, symbol_name) + glb_f1 = relay.GlobalVar(symbol_name) + mod_[glb_f1] = f1 + mod_ = relay.transform.InferType()(mod_) + return glb_f1, mod_ + + mod = tvm.IRModule() + + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + w2 = relay.var("w2", shape=(10, 10)) + + glb_symbol_f1, mod = create_external_func1(mod, "ethosu", "ethosu_0") + pq_tuple = relay.Call(glb_symbol_f1, [x, w0, w1, w2]) + + p0 = relay.TupleGetItem(pq_tuple, 0) + q0 = relay.TupleGetItem(pq_tuple, 1) + r = relay.concatenate((p0, q0), axis=0) + main = relay.Function([x, w0, w1, w2], r) + mod["main"] = main + mod = relay.transform.InferType()(mod) + return mod + + def expected(): + def create_external_func1(mod_, compiler_name, symbol_name): + ifms_int = relay.var("ifms_int", shape=[400]) + + # splits + (x_int_flat, w0_int_flat, w1_int_flat, w2_int_flat) = relay.split( + ifms_int, [100, 200, 300] + ) + # reshapes + x_int = relay.reshape(x_int_flat, newshape=(10, 10)) + w0_int = relay.reshape(w0_int_flat, newshape=(10, 10)) + w1_int = relay.reshape(w1_int_flat, newshape=(10, 10)) + w2_int = relay.reshape(w2_int_flat, newshape=(10, 10)) + + z0 = relay.add(x_int, w0_int) + p0 = relay.subtract(z0, w1_int) + q0 = relay.multiply(z0, w2_int) + # f1_o_tuple = relay.Tuple([p0, q0]) + + # reshapes + p0_reshaped = relay.reshape(p0, newshape=100) + q0_reshaped = relay.reshape(q0, newshape=100) + ofms = relay.concatenate((p0_reshaped, q0_reshaped), 0) + + f1 = relay.Function([ifms_int], ofms) + f1 = set_func_attr(f1, compiler_name, symbol_name) + glb_f1 = relay.GlobalVar(symbol_name) + mod_[glb_f1] = f1 + mod_ = relay.transform.InferType()(mod_) + return glb_f1, mod_ + + mod = tvm.IRModule() + + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + w2 = relay.var("w2", shape=(10, 10)) + + # reshapes + x_reshaped = relay.reshape(x, newshape=100) + w0_reshaped = relay.reshape(w0, newshape=100) + w1_reshaped = relay.reshape(w1, newshape=100) + w2_reshaped = relay.reshape(w2, newshape=100) + + # concat + ifms = relay.concatenate((x_reshaped, w0_reshaped, w1_reshaped, w2_reshaped), 0) + + # call + glb_func, mod = create_external_func1(mod, "ethosu", "ethosu_0") + ofms = relay.Call(glb_func, [ifms]) + + # splits + (p0_flat, q0_flat) = relay.split(ofms, [100]) + # reshapes + p0_flat_reshaped = relay.reshape(p0_flat, newshape=(10, 10)) + q0_flat_reshaped = relay.reshape(q0_flat, newshape=(10, 10)) + # original output + tuple_out = relay.Tuple([p0_flat_reshaped, q0_flat_reshaped]) + + p0 = relay.TupleGetItem(tuple_out, 0) + q0 = relay.TupleGetItem(tuple_out, 1) + + r = relay.concatenate((p0, q0), axis=0) + main = relay.Function([x, w0, w1, w2], r) + mod["main"] = main + mod = relay.transform.InferType()(mod) + return mod + + mod = create_graph() + exp = expected() + mod = preprocess.preprocess_ext_io()(mod) + assert tvm.ir.structural_equal(mod, exp, map_free_vars=True) + + +if __name__ == "__main__": + 
pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py new file mode 100644 index 000000000000..96fe56d1778e --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -0,0 +1,547 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +import tvm.script +from tvm.script import tir, ty +from tvm import relay +from tvm.relay.testing import run_opt_pass +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir +from tvm.relay.backend.contrib.ethosu.tir.scheduler import total_cascader +from infra import make_ethosu_conv2d, get_convolutional_args + + +@pytest.mark.parametrize( + "trial", + [ + [(1, 8, 8, 3), 3, 16, (1, 1), (2, 1), (1, 1), (1, 1), "TANH", "NHWC", "NHWC"], + [(1, 8, 8, 3), 3, 16, (1, 1), (0, 0), (1, 1), (1, 1), "NONE", "NHWC", "NHWC"], + [(1, 1, 1, 1), 1, 16, (1, 1), (0, 0), (1, 1), (1, 1), "CLIP", "NHWC", "NHWC"], + [(1, 7, 9, 4), 4, 13, (3, 2), (1, 2), (2, 1), (1, 2), "SIGMOID", "NHWC", "NHWC"], + [(1, 8, 2, 8, 16), 18, 12, (1, 1), (2, 1), (1, 1), (1, 1), "CLIP", "NHCWB16", "NHWC"], + [(1, 7, 9, 4), 4, 71, (3, 2), (1, 2), (2, 1), (1, 2), "CLIP", "NHWC", "NHCWB16"], + [(1, 4, 12, 9, 16), 182, 67, (2, 3), (6, 3), (2, 2), (1, 1), "CLIP", "NHCWB16", "NHCWB16"], + [(1, 7, 9, 4), 4, 13, (3, 2), (1, 2), (2, 1), (2, 2), "CLIP", "NHWC", "NHWC"], + [(1, 7, 9, 4), 4, 71, (3, 2), (1, 2), (2, 1), (2, 2), "CLIP", "NHWC", "NHCWB16"], + [ + (1, 13, 12, 19, 16), + 182, + 67, + (1, 3), + (5, 3), + (2, 1), + (2, 1), + "CLIP", + "NHCWB16", + "NHCWB16", + ], + ], +) +def test_conv2d_single(trial): + def _get_func( + ifm_shape, + ifm_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ): + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + conv = make_ethosu_conv2d( + ifm, + ifm_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + # TODO(@mbaret) Fix the tests for these known failures + # These are anticipated to actually be correct, just a testing issue to do with + # equivalent convolutions. 
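+    # NOTE: known_failures below is not wired into the parametrization above;
+    # it only records the configurations that currently trigger the issue.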
+ known_failures = [ + [(1, 3, 12, 9, 16), 182, 67, (2, 3), (1, 3), (2, 2), (1, 1), "CLIP", "NHCWB16", "NHCWB16"], + [(1, 2, 12, 9, 16), 182, 67, (1, 3), (6, 3), (2, 2), (1, 1), "CLIP", "NHCWB16", "NHCWB16"], + ] + func = _get_func(*trial) + mod, _ = lower_to_tir(func) + data = [] + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Call): + data.append(get_convolutional_args(stmt, remove_constants=True)) + + tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _visit) + ( + ifm_shape, + ifm_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ) = trial + dilated_kernel_h = (kernel_shape[0] - 1) * dilation[0] + 1 + dilated_kernel_w = (kernel_shape[1] - 1) * dilation[1] + 1 + if ifm_layout == "NHWC": + ifm_stride_c = 1 + ifm_stride_w = ifm_shape[3] + ifm_stride_h = ifm_shape[2] * ifm_shape[3] + ofm_height = (ifm_shape[1] - dilated_kernel_h + padding[0] + padding[0]) // strides[0] + 1 + ofm_width = (ifm_shape[2] - dilated_kernel_w + padding[1] + padding[1]) // strides[1] + 1 + else: + ifm_stride_w = 16 + ifm_stride_c = 16 * ifm_shape[3] + ifm_stride_h = 16 * ifm_shape[2] * ifm_shape[3] + ofm_height = (ifm_shape[1] - dilated_kernel_h + padding[0] + padding[0]) // strides[0] + 1 + ofm_width = (ifm_shape[3] - dilated_kernel_w + padding[1] + padding[1]) // strides[1] + 1 + + if ofm_layout == "NHWC": + ofm_stride_c = 1 + ofm_stride_w = ofm_channels if ofm_width > 1 else 1 + ofm_stride_h = ofm_channels * ofm_width if ofm_height > 1 else 1 + else: + ofm_stride_w = 16 + ofm_stride_c = 16 * ofm_width + ofm_stride_h = 16 * ofm_width * ((ofm_channels - 1) // 16 + 1) + + answer = [ + "int8", + ifm_shape[1], + ifm_shape[2] if ifm_layout == "NHWC" else ifm_shape[3], + ifm_channels, + ifm_shape[1], + 0, + ifm_shape[2] if ifm_layout == "NHWC" else ifm_shape[3], + 0, + 0, + 0, + 0, + 0.5, + 10, + ifm_layout, + ifm_stride_h, + ifm_stride_w, + ifm_stride_c, + "int8", + ofm_height, + ofm_width, + ofm_channels, + ofm_height, + 0, + ofm_width, + 0, + 0, + 0, + 0, + 0.25, + 14, + ofm_layout, + ofm_stride_h, + ofm_stride_w, + ofm_stride_c, + kernel_shape[1], + kernel_shape[0], + strides[1], + strides[0], + dilation[1], + dilation[0], + 12, + padding[0], + padding[1], + padding[0], + padding[1], + activation, + 10 if activation == "CLIP" else 0, + 100 if activation == "CLIP" else 0, + "NONE", + ] + assert data[0] == answer, data[0] + + +# fmt: off +@tvm.script.tir +class Conv2dDoubleCascade1: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_3, [304], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_1, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([1024], "int8", "global") + 
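+        # 8 * 4 * 32 bytes: one NHWC stripe of the intermediate feature map,
+        # reused by both stripe iterations of the double cascade.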
tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 160, 12, tir.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 304, 12, tir.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, tir.load("int8", placeholder_5.data, 12), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 160, 12, tir.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, tir.load("int8", ethosu_write_1.data, 32), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 304, 12, tir.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None + + +@tvm.script.tir +class Conv2dDoubleCascade2: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_1, [1312], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_3, [2608], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([1536], "int8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, 
tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 48), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None + + +@tvm.script.tir +class Conv2dDoubleCascade3: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 20, 4, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_3, [1744], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_1, [880], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder, [1, 16, 16, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([2560], "int8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 16, 3, 8, 0, 16, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 8, 8, 32, 8, 0, 8, tir.load("int8", ethosu_write_2, 512), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer_3.data, 0), 880, 12, tir.load("uint8", buffer_2.data, 0), 320, 2, 1, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 8, 32, 8, 0, 8, tir.load("int8", ethosu_write_2, 512), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 8, 4, 8, 8, 0, 4, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer.data, 0), 1744, 12, tir.load("uint8", buffer_1.data, 0), 80, 2, 1, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 12, 16, 3, 12, 0, 16, tir.load("int8", placeholder_5.data, 192), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 10, 8, 32, 10, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer_3.data, 0), 880, 12, tir.load("uint8", buffer_2.data, 0), 320, 0, 1, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + 
tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 10, 8, 32, 10, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 8, 4, 8, 8, 0, 4, tir.load("int8", ethosu_write_1.data, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer.data, 0), 1744, 12, tir.load("uint8", buffer_1.data, 0), 80, 0, 1, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 4, 16, 3, 4, 0, 16, tir.load("int8", placeholder_5.data, 576), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 4, 8, 32, 4, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer_3.data, 0), 880, 12, tir.load("uint8", buffer_2.data, 0), 320, 0, 1, 2, 0, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 4, 8, 32, 4, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 4, 8, 4, 0, 4, tir.load("int8", ethosu_write_1.data, 512), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, tir.load("uint8", buffer.data, 0), 1744, 12, tir.load("uint8", buffer_1.data, 0), 80, 0, 1, 2, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None + + +@tvm.script.tir +class Conv2dDoubleCascade4: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_1, [1456], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_2, [352], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder, [1, 8, 1, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 2, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_4, [272], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_3, [11040], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_write_2 = tir.allocate([2304], "int8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 256), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, 
tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 1024), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +@pytest.mark.parametrize( + "trial", + [ + [ + Conv2dDoubleCascade1(), + (1, 8, 8, 3), + 3, + 32, + 8, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + "NHWC", + (1, 8, 4, 8), + ], + [ + Conv2dDoubleCascade2(), + (1, 8, 8, 3), + 3, + 32, + 8, + (3, 3), + (1, 1), + (1, 1), + (1, 1), + "NHWC", + (1, 4, 8, 8), + ], + [ + Conv2dDoubleCascade3(), + (1, 16, 16, 3), + 3, + 32, + 8, + (3, 2), + (2, 1), + (1, 2), + (1, 2), + "NHWC", + (1, 8, 4, 8), + ], + [ + Conv2dDoubleCascade4(), + (1, 8, 1, 8, 16), + 3, + 35, + 26, + (3, 3), + (1, 1), + (1, 1), + (1, 1), + "NHCWB16", + (1, 4, 2, 8, 16), + ], + ], +) +def test_conv2d_double_cascade(trial): + def _get_func( + ifm_shape, + ifm_channels, + mid_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + layout, + ): + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + conv1 = make_ethosu_conv2d( + ifm, + ifm_channels, + mid_channels, + kernel_shape, + padding, + strides, + dilation, + "NONE", + layout, + layout, + ) + conv2 = make_ethosu_conv2d( + conv1, + mid_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + "NONE", + layout, + layout, + ) + func = relay.Function(relay.analysis.free_vars(conv2), conv2) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + reference_mod = trial[0] + params = trial[1:] + func = _get_func(*params[:-1]) + mod, _ = lower_to_tir(func, cascader=total_cascader(params[-1])) + script = tvm.script.asscript(mod, True) + mod = tvm.script.from_source(script) + tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) + + +# fmt: off +@tvm.script.tir +class Conv2dInlineCopy1: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [1, 10, 12, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, tir.load("int8", placeholder_3.data, 120), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 96, 8, 1, "int8", 8, 8, 16, 8, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 848, 12, tir.load("uint8", buffer_1.data, 0), 160, 1, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ 
= None + + +@tvm.script.tir +class Conv2dInlineCopy2: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 3, 5, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [1, 7, 9, 5], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [656], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 3, 5, 3, 3, 0, 5, tir.load("int8", placeholder_3.data, 146), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 45, 5, 1, "int8", 3, 5, 16, 3, 0, 5, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 80, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 656, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +@pytest.mark.parametrize( + "trial", + [ + [Conv2dInlineCopy1(), (1, 10, 12, 8), (0, 1, 3, 0), (1, 9, 11, 4)], + [Conv2dInlineCopy2(), (1, 7, 9, 5), (0, 3, 2, 1), (1, 6, 7, 4)], + ], +) +def test_conv2d_inline_copy(trial): + def _get_func(ifm_shape, lower, upper, ofm_channels=16): + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + sliced = relay.strided_slice(ifm, lower, upper) + conv = make_ethosu_conv2d( + sliced, upper[3] - lower[3], ofm_channels, (3, 3), (1, 1), (1, 1), (1, 1) + ) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + reference_mod = trial[0] + params = trial[1:] + func = _get_func(*params) + mod, _ = lower_to_tir(func) + script = tvm.script.asscript(mod, True) + mod = tvm.script.from_source(script) + tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) + + +# fmt: off +@tvm.script.tir +class Conv2dInlineReshape1: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 6, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [4, 6, 8, 1], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 72), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", 
ethosu_write_1.data, 384), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None + + +@tvm.script.tir +class Conv2dInlineReshape2: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 6, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [1, 24, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 72), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 384), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None + + +@tvm.script.tir +class Conv2dInlineReshape3: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [192, 1], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 6, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 72), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 384), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + 
__tvm_meta__ = None + + +@tvm.script.tir +class Conv2dInlineReshape4: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 6, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 72), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 384), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +@pytest.mark.parametrize( + "trial", + [ + [Conv2dInlineReshape1(), (4, 6, 8, 1), (1, 8, 6, 4), "NHWC"], + [Conv2dInlineReshape2(), (1, 4 * 6, 8), (1, 8, 6, 4), "NHWC"], + [Conv2dInlineReshape3(), (4 * 6 * 8, 1), (1, 8, 6, 4), "NHWC"], + [Conv2dInlineReshape4(), (4 * 6 * 8,), (1, 8, 6, 4), "NHWC"], + ], +) +def test_conv2d_inline_reshape(trial): + def _get_func(ifm_shape, reshaped, ifm_layout): + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + ifm_reshaped = relay.reshape(ifm, reshaped) + conv = make_ethosu_conv2d( + ifm_reshaped, reshaped[3], 16, (3, 3), (1, 1), (1, 1), (1, 1), "NONE", ifm_layout + ) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + reference_mod = trial[0] + params = trial[1:] + func = _get_func(*params) + mod, _ = lower_to_tir(func, cascader=total_cascader((1, 4, 6, 16))) + script = tvm.script.asscript(mod, True) + mod = tvm.script.from_source(script) + tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) + + +# TODO(@mbaret) Fix this case +@pytest.mark.xfail(raises=TypeError, strict=True) +def test_conv2d_big_pad(): + def _get_func(): + ifm_shape = (1, 2, 2, 8) + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + conv = make_ethosu_conv2d(ifm, ifm_shape[3], 16, (1, 1), (7, 7), (1, 1), (1, 1), "NHWC") + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + mod, _ = lower_to_tir(func, cascader=total_cascader((1, 4, 4, 16))) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py new file mode 100644 index 000000000000..222dccacc906 --- /dev/null +++ 
b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +import tvm +import tvm.script +from tvm.script import tir, ty +from tvm import relay +from tvm.relay.testing import run_opt_pass +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir +from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants + +from infra import make_ethosu_conv2d + + +# fmt: off +@tvm.script.tir +class ReferenceModule: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None: + # function attr dict + tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer = tir.match_buffer(placeholder_2, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_1, [304], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + placeholder_global = tir.allocate([304], "uint8", "global") + placeholder_d_global = tir.allocate([80], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 304, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer.data, 0), 80, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 304, 12, tir.load("uint8", placeholder_d_global, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +def test_copy(): + def _get_func(): + data = relay.var("data", shape=(1, 16, 16, 32), dtype="int8") + conv = make_ethosu_conv2d( + data, + 32, + 8, + (1, 1), + (0, 0), + (1, 1), + (1, 1), + ) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + mod, _ = lower_to_tir(func, cascader=copy_constants()) + + script = tvm.script.asscript(mod, True) + test_mod = tvm.script.from_source(script) + reference_mod = ReferenceModule() + tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) + + +if __name__ == 
"__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py new file mode 100644 index 000000000000..b07f8ea7f48b --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") +from tvm import relay +from tvm.relay.testing import run_opt_pass +from tvm import te, topi +from tvm.relay.backend.contrib.ethosu.tir.scheduler import ( + tile_nd, + schedule_pragmas, + inline_no_ops, + total_cascader, + copy_constants, + schedule_cache_reads, +) +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_te, extract_constants +from infra import AttachType, make_ethosu_conv2d + + +class TestTEGraph: + def __init__(self, inputs, outputs): + self.inputs = inputs + self.outputs = outputs + + +def test_tile_nd(): + input = te.placeholder((12, 12), dtype="uint8", name="input") + out = topi.nn.relu(input) + sch = te.create_schedule([out.op]) + outer_iters, inner_iters = tile_nd(sch, out, (3, 4)) + assert tuple(sch[out].leaf_iter_vars) == (*outer_iters, *inner_iters) + + +def test_schedule_pragmas(): + input = te.placeholder((12, 12), dtype="uint8", name="input") + out = te.compute( + (12, 12), + lambda i, j: input[i, j], + attrs={ + "op": "unity", + "info": 1, + }, + ) + sch = te.create_schedule([out.op]) + sch[out].split(out.op.axis[0], 3) + schedule_pragmas(sch) + iter_var = sch[out].leaf_iter_vars[1] + assert list(sch[out].iter_var_attrs[iter_var].pragma_keys) == ["op", "info"] + assert list(sch[out].iter_var_attrs[iter_var].pragma_values) == ["unity", 1] + + +def test_schedule_pragmas_for_const(): + input = te.placeholder((12, 12), dtype="uint8", name="input") + const = te.compute((), lambda: 2) + add = topi.add(input, const) + sch = te.create_schedule([add.op]) + schedule_pragmas(sch) + + +def test_inline_no_ops(): + input = relay.var("input", shape=(12, 12), dtype="uint8") + slice = relay.strided_slice(input, [0, 0], [6, 6]) + relu1 = relay.nn.relu(slice) + reshape = relay.reshape(relu1, (36,)) + relu2 = relay.nn.relu(reshape) + func = relay.Function(relay.analysis.free_vars(relu2), relu2) + func = run_opt_pass(func, relay.transform.InferType()) + + te_graph = lower_to_te(func) + sch = te.create_schedule([te_graph.outputs[0].op]) + inline_no_ops(te_graph, sch) + reshape_tensor = te_graph.outputs[0].op.input_tensors[0] + slice_tensor = reshape_tensor.op.input_tensors[0].op.input_tensors[0] + assert sch[reshape_tensor].attach_type == AttachType.kInline + assert sch[slice_tensor].attach_type == AttachType.kInline + + +def test_total_cascader(): + input = te.placeholder((12, 12), dtype="uint8", name="input") + relu1 = topi.nn.relu(input) + 
relu2 = topi.nn.relu(relu1) + relu3 = topi.nn.relu(relu2) + sch = te.create_schedule([relu3.op]) + cascader = total_cascader((4, 4)) + cascader(TestTEGraph([input], [relu3]), {}, sch) + assert sch[relu1].attach_type == AttachType.kScope + assert sch[relu2].attach_type == AttachType.kScope + assert sch[relu3].attach_type == AttachType.kGroupRoot + # Check that the attaches are at the correct iter var + assert sch[relu1].attach_ivar == sch[relu3].leaf_iter_vars[1] + assert sch[relu2].attach_ivar == sch[relu3].leaf_iter_vars[1] + + +def test_copy_constants(): + ifm_a = relay.var("IFM_A", shape=(1, 26, 26, 32), dtype="int8") + conv_a = make_ethosu_conv2d(ifm_a, 32, 8, (3, 3), (0, 0), (1, 1), (1, 1)) + conv_b = make_ethosu_conv2d(conv_a, 8, 4, (1, 1), (0, 0), (1, 1), (1, 1)) + func = relay.Function(relay.analysis.free_vars(conv_b), conv_b) + func = run_opt_pass(func, relay.transform.InferType()) + + func, const_dict = extract_constants(func) + te_graph = lower_to_te(func) + + sch = te.create_schedule([te_graph.outputs[0].op]) + planner = copy_constants() + planner(te_graph, const_dict, sch) + assert len(sch.stages) == 21 + assert ".global" in sch.stages[5].op.name + assert ".global" in sch.stages[7].op.name + assert ".global" in sch.stages[15].op.name + assert ".global" in sch.stages[17].op.name + + +def test_schedule_cache_reads(): + a = te.placeholder((12, 12), dtype="uint8", name="a") + b = te.placeholder((12, 12), dtype="uint8", name="b") + add = topi.add(a, b) + sch = te.create_schedule([add.op]) + cr = sch.cache_read(b, "global", [add]) + schedule_cache_reads(sch) + assert len(sch.stages) == 4 + assert len(sch[cr].leaf_iter_vars) == 1 + iv = sch[cr].leaf_iter_vars[0] + assert list(sch[cr].iter_var_attrs[iv].pragma_keys) == ["op"] + assert list(sch[cr].iter_var_attrs[iv].pragma_values) == ["ethosu_copy"] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py new file mode 100644 index 000000000000..a86dd919d5ca --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_vela_api.py @@ -0,0 +1,556 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
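+
+# NOTE (illustrative): the tests in this file follow a mock-and-verify
+# pattern -- the Vela API entry points are replaced with unittest.mock
+# patches so the keyword arguments TVM passes to Vela can be inspected
+# without running the real encoders. A minimal sketch of the pattern,
+# with hypothetical code_under_test/expected_accelerator names:
+#
+#     from unittest.mock import patch
+#
+#     with patch(
+#         "tvm.relay.backend.contrib.ethosu.vela_api.vapi.npu_encode_weights"
+#     ) as mock_api:
+#         code_under_test()                 # exercises the patched API
+#     mock_api.assert_called_once()         # the API was invoked exactly once
+#     kwargs = mock_api.call_args[1]        # captured keyword arguments
+#     assert kwargs["accelerator"] == expected_accelerator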
+import pytest + +pytest.importorskip("ethosu.vela") +import numpy as np +from ethosu.vela import api as vapi +from unittest.mock import patch + +import tvm +from tvm import tir +from tvm.script import ty +from tvm.tir import stmt_functor +from tvm.relay.backend.contrib.ethosu import vela_api +import tvm.relay.backend.contrib.ethosu.tir_to_cs_translator as tirtocs + +ACCEL_TYPES = [ + vapi.NpuAccelerator.Ethos_U55_256, + vapi.NpuAccelerator.Ethos_U55_128, + vapi.NpuAccelerator.Ethos_U55_64, + vapi.NpuAccelerator.Ethos_U55_32, +] + + +"""Test case 1""" + + +@tvm.script.tir +class Module1: + def main( + placeholder: ty.handle, + placeholder_1: ty.handle, + placeholder_2: ty.handle, + ethosu_conv2d: ty.handle, + ) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_3 = tir.match_buffer( + placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + placeholder_4 = tir.match_buffer( + placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + placeholder_5 = tir.match_buffer( + placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + ) + ethosu_conv2d_1 = tir.match_buffer( + ethosu_conv2d, [1, 8, 8, 16], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + # body + tir.evaluate( + tir.call_extern( + "ethosu_conv2d", + "uint8", + 8, + 8, + 3, + 8, + 0, + 8, + tir.load("uint8", placeholder_3.data, 0), + 0, + 0, + 0, + tir.float32(0.5), + 10, + "NHWC", + 24, + 3, + 1, + "uint8", + 8, + 8, + 16, + 8, + 0, + 8, + tir.load("uint8", ethosu_conv2d_1.data, 0), + 0, + 0, + 0, + tir.float32(0.25), + 14, + "NHWC", + 128, + 16, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + tir.load("uint8", placeholder_4.data, 0), + 0, + 12, + tir.load("uint8", placeholder_5.data, 0), + 0, + 0, + 0, + 0, + 0, + "CLIP", + 0, + 0, + "NONE", + dtype="uint8", + ) + ) + + __tvm_meta__ = None + + +"""Test case 2 with per-channel quantization""" + + +@tvm.script.tir +class Module2: + def main( + placeholder: ty.handle, + placeholder_1: ty.handle, + placeholder_2: ty.handle, + placeholder_6: ty.handle, + ethosu_conv2d: ty.handle, + ) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_3 = tir.match_buffer( + placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + placeholder_4 = tir.match_buffer( + placeholder_1, [16, 1, 1, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + placeholder_5 = tir.match_buffer( + placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + ) + # Per-channel weight scales + placeholder_7 = tir.match_buffer( + placeholder_6, [16], dtype="float32", elem_offset=0, align=128, offset_factor=1 + ) + ethosu_conv2d_1 = tir.match_buffer( + ethosu_conv2d, [1, 8, 8, 16], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ) + # body + tir.evaluate( + tir.call_extern( + "ethosu_conv2d", + "uint8", + 8, + 8, + 3, + 8, + 0, + 8, + tir.load("uint8", placeholder_3.data, 0), + 0, + 0, + 0, + tir.float32(0.5), + 10, + "NHWC", + 24, + 3, + 1, + "uint8", + 8, + 8, + 16, + 8, + 0, + 8, + tir.load("uint8", ethosu_conv2d_1.data, 0), + 0, + 0, + 0, + tir.float32(0.25), + 14, + "NHWC", + 128, + 16, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + tir.load("uint8", placeholder_4.data, 0), + 0, + 12, + tir.load("uint8", placeholder_5.data, 0), + 0, + 0, + 0, + 0, + 0, + "CLIP", + 0, + 0, + "NONE", + dtype="uint8", + ) + ) + + __tvm_meta__ = None + + +def 
test_get_optimal_block_config():
+    block_configs_cases = [
+        {
+            "test": [
+                vapi.NpuShape3D(10, 20, 8),
+                vapi.NpuShape3D(10, 30, 16),
+                vapi.NpuShape3D(10, 40, 32),
+            ],
+            "ref": vapi.NpuShape3D(10, 40, 32),
+        },
+        {
+            "test": [
+                vapi.NpuShape3D(10, 20, 8),
+                vapi.NpuShape3D(10, 50, 32),
+                vapi.NpuShape3D(10, 40, 32),
+            ],
+            "ref": vapi.NpuShape3D(10, 50, 32),
+        },
+        {
+            "test": [
+                vapi.NpuShape3D(50, 50, 8),
+                vapi.NpuShape3D(10, 30, 32),
+                vapi.NpuShape3D(8, 8, 64),
+            ],
+            "ref": vapi.NpuShape3D(8, 8, 64),
+        },
+    ]
+
+    for test_case in block_configs_cases:
+        assert vela_api._get_optimal_block_config(test_case["test"]) == test_case["ref"]
+
+
+def test_compress_weights():
+    test_vecs = [
+        {
+            # Stimulus
+            "accel": vapi.NpuAccelerator.Ethos_U55_256,
+            "block_depth": 8,
+            "ifm_dtype": np.uint8,
+            "shape": (3, 3, 16, 64),
+            "layout": "HWIO",
+            "zero_point": np.int64(134),
+            "dilation": (1, 1),
+            "is_depthwise": False,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.PART_KERNEL_FIRST,
+        },
+        {
+            # Stimulus
+            "accel": vapi.NpuAccelerator.Ethos_U55_256,
+            "block_depth": 8,
+            "ifm_dtype": np.uint8,
+            "shape": (3, 3, 32, 64),
+            "layout": "HWIO",
+            "zero_point": np.int64(134),
+            "dilation": (1, 1),
+            "is_depthwise": False,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.DEPTH_FIRST,
+        },
+        {
+            # Stimulus
+            "accel": vapi.NpuAccelerator.Ethos_U55_256,
+            "block_depth": 8,
+            "ifm_dtype": np.int16,
+            "shape": (3, 3, 16, 64),
+            "layout": "HWIO",
+            "zero_point": np.int64(134),
+            "dilation": (1, 1),
+            "is_depthwise": False,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.DEPTH_FIRST,
+        },
+        # Pass-through value check
+        {
+            # Stimulus
+            "accel": vapi.NpuAccelerator.Ethos_U55_128,
+            "block_depth": 16,
+            "ifm_dtype": np.uint8,
+            "shape": (243, 152, 7, 1),
+            "layout": "HWOI",
+            "zero_point": np.int64(110),
+            "dilation": (2, 2),
+            "is_depthwise": True,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.DEPTH_FIRST,
+        },
+        {
+            # Stimulus
+            "accel": vapi.NpuAccelerator.Ethos_U55_128,
+            "block_depth": 32,
+            "ifm_dtype": np.uint8,
+            "shape": (64, 67, 35, 8),
+            "layout": "OHWI",
+            "zero_point": np.int64(100),
+            "dilation": (1, 2),
+            "is_depthwise": False,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.PART_KERNEL_FIRST,
+        },
+    ]
+
+    def verify(test_vec, mock_obj):
+        layout_transform_indices = {
+            "HWIO": (3, 0, 1, 2),
+            "HWOI": (2, 0, 1, 3),
+            "OHWI": (0, 1, 2, 3),
+        }
+
+        assert mock_obj
+        mock_obj.assert_called_once()
+        assert mock_obj.call_args[1]["accelerator"] == test_vec["accel"]
+        ishape = test_vec["shape"]
+        shape_owhi = (
+            ishape[layout_transform_indices[test_vec["layout"]][0]],
+            ishape[layout_transform_indices[test_vec["layout"]][1]],
+            ishape[layout_transform_indices[test_vec["layout"]][2]],
+            ishape[layout_transform_indices[test_vec["layout"]][3]],
+        )
+        assert mock_obj.call_args[1]["weights_volume"].shape == shape_owhi
+        assert mock_obj.call_args[1]["dilation_xy"] == test_vec["dilation"]
+        assert mock_obj.call_args[1]["ifm_bitdepth"] == np.iinfo(test_vec["ifm_dtype"]).bits
+        assert mock_obj.call_args[1]["ofm_block_depth"] == test_vec["block_depth"]
+        assert mock_obj.call_args[1]["is_depthwise"] == test_vec["is_depthwise"]
+        assert mock_obj.call_args[1]["block_traversal"] == test_vec["block_traversal"]
+
+    def create_mock(test_vec):
+        with patch(
+            "tvm.relay.backend.contrib.ethosu.vela_api.vapi.npu_encode_weights"
+        ) as mock_npu_encode_weights:
+            ifm_bitdepth = np.iinfo(test_vec["ifm_dtype"]).bits
+            ifm_dtype = test_vec["ifm_dtype"]
+            dtype_max = np.iinfo(ifm_dtype).max
+            dtype_min = np.iinfo(ifm_dtype).min
+            values = np.random.randint(dtype_min, dtype_max, test_vec["shape"], ifm_dtype)
+            compressed_weights = vela_api.compress_weights(
+                weights=values,
+                weights_zp=test_vec["zero_point"],
+                weights_layout=test_vec["layout"],
+                ifm_bitdepth=ifm_bitdepth,
+                block_depth=test_vec["block_depth"],
+                dilation=test_vec["dilation"],
+                accel_type=test_vec["accel"],
+                is_depthwise=test_vec["is_depthwise"],
+            )
+            return mock_npu_encode_weights
+
+    for tv in test_vecs:
+        mock_obj = create_mock(tv)
+        verify(tv, mock_obj)
+
+
+def test_pack_biases():
+    test_vecs = [
+        {
+            # Stimulus
+            "bias_length": 3,
+            "ifm_scale": np.single(1.11111111),
+            "ifm_dtype": np.uint8,
+            "weight_scales": np.array(
+                [np.single(0.91111111), np.single(1.01111111), np.single(1.11111111)]
+            ),
+            "ofm_scale": np.single(1.2),
+            "is_activation_tanh_or_sigmoid": False,
+            # Reference outputs
+            "hw_scales": (1811663288, 2010504240, 1104672703),
+            "hw_shifts": (31, 31, 30),
+        },
+        {
+            # Stimulus
+            "bias_length": 3,
+            "ifm_scale": np.single(1.11111111),
+            "ifm_dtype": np.int8,
+            "weight_scales": np.array(
+                [np.single(0.91111111), np.single(1.01111111), np.single(1.11111111)]
+            ),
+            "ofm_scale": np.single(1.2),
+            "is_activation_tanh_or_sigmoid": False,
+            # Reference outputs
+            "hw_scales": (1811663185, 2010504312, 1104672720),
+            "hw_shifts": (31, 31, 30),
+        },
+        {
+            # Stimulus
+            "bias_length": 3,
+            "ifm_scale": np.single(1.11111111),
+            "ifm_dtype": np.int16,
+            "weight_scales": np.array(
+                [np.single(0.91111111), np.single(1.01111111), np.single(1.11111111)]
+            ),
+            "ofm_scale": np.single(1.2),
+            "is_activation_tanh_or_sigmoid": False,
+            # Reference outputs
+            "hw_scales": (27644, 30678, 16856),
+            "hw_shifts": (15, 15, 14),
+        },
+    ]
+
+    def verify(test_vec, mock_obj, packed_biases):
+        assert mock_obj
+        for idx, val in enumerate(test_vec["bias_values"]):
+            assert val == mock_obj.call_args_list[idx][0][0]
+            assert test_vec["hw_scales"][idx] == mock_obj.call_args_list[idx][0][1]
+            assert test_vec["hw_shifts"][idx] == mock_obj.call_args_list[idx][0][2]
+
+    def create_mock(test_vec):
+        with patch(
+            "tvm.relay.backend.contrib.ethosu.vela_api.vapi.npu_encode_bias"
+        ) as mock_npu_encode_bias:
+            mock_npu_encode_bias.return_value = bytearray(10)
+            ifm_dtype = test_vec["ifm_dtype"]
+            dtype_max = np.iinfo(ifm_dtype).max
+            dtype_min = np.iinfo(ifm_dtype).min
+            # tvm will always create biases in int32
+            biases = np.random.randint(dtype_min, dtype_max, test_vec["bias_length"], np.int32)
+            packed_biases = vela_api.pack_biases(
+                biases=biases,
+                ifm_scale=test_vec["ifm_scale"],
+                ifm_dtype=test_vec["ifm_dtype"],
+                weight_scales=test_vec["weight_scales"],
+                ofm_scale=test_vec["ofm_scale"],
+                is_activation_tanh_or_sigmoid=test_vec["is_activation_tanh_or_sigmoid"],
+            )
+            test_vec["bias_values"] = biases
+            return mock_npu_encode_bias, packed_biases
+
+    for _test_vec in test_vecs:
+        mock_obj, packed_biases = create_mock(_test_vec)
+        verify(_test_vec, mock_obj, packed_biases)
+
+
+def extract_ethosu_conv2d_extern_calls(mod):
+    """Obtain all ethosu_conv2d calls from an NPU TIR module
+
+    Parameters
+    ----------
+    mod : tvm.IRModule
+        An NPU TIR module
+
+    Returns
+    -------
+    list
+        List of tvm.tir.Call objects
+        that are tir extern calls
+        for ethosu_conv2d
+    """
+    # There should only be a single function
+    assert len(mod.functions.items()) == 1
+    primfunc = mod.functions.items()[0][1]
+
+    ethosu_conv2d_calls = list()
+
+    def populate_ethosu_conv2d_calls(stmt):
+        if (
+            isinstance(stmt, tvm.tir.Call)
+            and stmt.op.name == "tir.call_extern"
+            and stmt.args[0] == "ethosu_conv2d"
+        ):
+            ethosu_conv2d_calls.append(stmt)
+
+    stmt_functor.post_order_visit(primfunc.body, populate_ethosu_conv2d_calls)
+    return ethosu_conv2d_calls
+
+
+@pytest.mark.parametrize(
+    "accel",
+    ACCEL_TYPES,
+)
+def test_encode_weights(accel):
+    test_vecs = [
+        {
+            # Stimulus
+            "tir_module": Module1(),
+            "param_dict": {
+                1: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [48], "uint8"),
+                2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [16], "int32"),
+            },
+            "accel_type": accel,
+            # Reference outputs
+            "block_traversal": vapi.NpuBlockTraversal.PART_KERNEL_FIRST,
+        },
+    ]
+
+    def create_mock(test_vec):
+        with patch(
+            "tvm.relay.backend.contrib.ethosu.vela_api.vapi.npu_encode_weights"
+        ) as mock_enc_w:
+            with patch(
+                "tvm.relay.backend.contrib.ethosu.vela_api.vapi.npu_find_block_configs"
+            ) as mock_blk_cfg:
+                mock_blk_cfg.return_value = [vapi.NpuShape3D(8, 8, 8)]
+                ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_vec["tir_module"])
+                buffer_info = tirtocs.extract_buffer_info(
+                    test_vec["tir_module"], test_vec["param_dict"]
+                )
+                for ethosu_conv2d_call in ethosu_conv2d_calls:
+                    npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
+                    weights = buffer_info[npu_op.weights[0].address.buffer_var][0]
+                    vela_api.encode_weights(ethosu_conv2d_call, weights, accel)
+                return mock_enc_w
+
+    def verify(test_vec, mock_enc_w):
+        ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_vec["tir_module"])
+        buffer_info = tirtocs.extract_buffer_info(test_vec["tir_module"], test_vec["param_dict"])
+        for ethosu_conv2d_call in ethosu_conv2d_calls:
+            npu_op, w_zero_point = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
+            weights = buffer_info[npu_op.weights[0].address.buffer_var][0]
+
+            assert mock_enc_w.call_args[1]["accelerator"] == accel
+            assert (
+                mock_enc_w.call_args[1]["weights_volume"].flatten()
+                == weights.astype(np.int64) - w_zero_point
+            ).all()
+            assert mock_enc_w.call_args[1]["dilation_xy"] == (
+                npu_op.kernel.dilation_x,
+                npu_op.kernel.dilation_y,
+            )
+            assert mock_enc_w.call_args[1]["ifm_bitdepth"] == npu_op.ifm.data_type.size_in_bits()
+            assert mock_enc_w.call_args[1]["block_traversal"] == test_vec["block_traversal"]
+
+    for _test_vec in test_vecs:
+        _mock_enc_w = create_mock(_test_vec)
+        verify(_test_vec, _mock_enc_w)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/python/contrib/test_hexagon/__init__.py b/tests/python/contrib/test_hexagon/__init__.py
new file mode 100644
index 000000000000..58dc4cc1e03d
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Testing infrastructure for Hexagon """ diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py new file mode 100644 index 000000000000..0329328de3df --- /dev/null +++ b/tests/python/contrib/test_hexagon/conftest.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Hexagon testing fixtures used to deduce testing argument + values from testing parameters """ + +import tvm +from .infrastructure import get_packed_filter_layout + + +@tvm.testing.fixture +def shape_nhwc(batch, in_channel, in_size): + return (batch, in_size, in_size, in_channel) + + +@tvm.testing.fixture +def shape_oihw(out_channel, in_channel, kernel): + return (out_channel, in_channel, kernel, kernel) + + +@tvm.testing.fixture +def shape_oihw8i32o4i(out_channel, in_channel, kernel): + return get_packed_filter_layout(out_channel, in_channel, kernel, kernel) diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py new file mode 100644 index 000000000000..193a8630c3d2 --- /dev/null +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
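+
+# NOTE (illustrative): with the 8x8x32 block shape returned by
+# get_block_shape() below, get_packed_activation_layout() packs an NHWC
+# activation into [N, ceil(H/8), ceil(W/8), ceil(C/32), 8, 8, 32]. For
+# example, a (1, 56, 56, 64) activation packs to [1, 7, 7, 2, 8, 8, 32],
+# and get_packed_filter_layout(64, 64, 3, 3) yields (2, 2, 3, 3, 8, 32, 4)
+# for the oihw8i32o4i filter layout.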
+ +""" Hexagon testing infrastructure """ + +import tvm +import numpy + + +def ceildiv(o, d): + return tvm.tir.floordiv(o + d - 1, d) + + +def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True): + assert len(shape_nhwc) == 4 + shape = [shape_nhwc[0]] + off_h, off_w, off_c = block_shape + shape.append(ceildiv(shape_nhwc[1], off_h)) + shape.append(ceildiv(shape_nhwc[2], off_w)) + if packed_C: + shape.append(ceildiv(shape_nhwc[3], off_c)) + shape.extend(block_shape) + else: + shape.extend([off_h, off_w, shape_nhwc[3]]) + return shape + + +def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w): + out_factor, in_first_factor, in_second_factor = 32, 32, 4 + return ( + int(ceildiv(out_channel, out_factor)), + int(ceildiv(in_channel, in_first_factor)), + kernel_h, + kernel_w, + in_first_factor // in_second_factor, + out_factor, + in_second_factor, + ) + + +def build_and_run(inputs, func, target, target_host, *args, **kwargs): + schedule, placeholders, binds = func(*args, **kwargs) + + func = tvm.build(schedule, placeholders, target=target, target_host=target_host, binds=binds) + dev = tvm.device(target) + tensors = [] + for tensor in inputs: + tensors.append(tvm.nd.array(tensor, dev)) + tensors.append( + tvm.nd.array( + numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), + dev, + ) + ) + func(*tensors) + + return tensors[-1].asnumpy() + + +def get_block_shape(): + return 8, 8, 32 + + +def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, out_channels): + assert len(shape_nhwc) == 4 + kernel = [] + kernel.append((kernel_size[0] - 1) * dilation[0] + 1) + kernel.append((kernel_size[1] - 1) * dilation[1] + 1) + return ( + shape_nhwc[0], + (shape_nhwc[1] - kernel[0] + padding[0] + padding[1]) // strides[0] + 1, + (shape_nhwc[2] - kernel[1] + padding[2] + padding[3]) // strides[1] + 1, + out_channels, + ) diff --git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py new file mode 100644 index 000000000000..e0b7fb20ab8e --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py @@ -0,0 +1,473 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +import tvm +from tvm import te +from tvm import topi +from tvm.topi import testing +from .infrastructure import ( + ceildiv, + build_and_run, + get_block_shape, + get_conv2d_nhwc_shape, + get_packed_filter_layout, + get_packed_activation_layout, +) + +import numpy as np +import pytest + + +def conv2d_logical( + shape_nhwc, + shape_oihw, + kernel_size, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Conv2d TE wherein both input activation and filter tensors + are defined with their logical NHWC/OIHW shapes, respectively. 
+ The packed physical layout for the activation and filter are: + Activation: nhwc8h8w32c + Filter: oihw8i32o4i + """ + assert kernel_size == tuple(shape_oihw[2:]) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, kernel_size, stride, padding, [1, 1], shape_oihw[0] + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + X_packed = te.compute( + shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + # Filter shape using KCRS (OIHW) notation + K, C, R, S = shape_oihw + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + shape_filter = get_packed_filter_layout(K, C, R, S) + filt = te.placeholder(shape_oihw, dtype=dtype) + # Channel padding to multiples of 32 + pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci + pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki + filt_pad = topi.nn.pad( + filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter" + ) + filt_packed = te.compute( + shape_filter, + lambda ko, co, r, s, cio, ki, cii: filt_pad[ + ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s + ], + name="packed_filter", + ) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct blockized conv2d channel index + c_block_id = rc // block_C + c_block_offset = rc % block_C + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + s[filt_pad].compute_inline() + s[filt_packed].compute_inline() + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt, Y], binds) + + +def conv2d_packed_filter( + shape_nhwc, + shape_oihw8i32o4i, + kernel_size, + stride, + padding, + dtype, + 
storage_scope="global", +): + """ + Conv2d TE wherein the input activation is defined by its + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. The physical packed layout used + for the activation is: nhwc8h8w32c + """ + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, + kernel_size, + stride, + padding, + [1, 1], + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], + ) + + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + packed_shape = get_packed_activation_layout(X_pad.shape, block_shape) + + X_packed = te.compute( + packed_shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii + + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct blockized conv2d channel index + c_block_id = rc // block_C + c_block_offset = rc % block_C + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + # Perform scheduling + n, hid, wid, cid, hoff, woff, coff = s[Y].op.axis + slice = s[Y].fuse(wid, cid) + Xl = s.cache_read(X_packed, storage_scope, [Y]) + Yl = s.cache_write(Y, storage_scope) + + s[Yl].compute_at(s[Y], hid) + n, hid, slice, hoff, woff, coff = s[Yl].op.axis + s[Xl].compute_at(s[Yl], slice) + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} 
+ + return (s, [X, filt_packed, Y], binds) + + +def conv2d_packed_filter_nhwhwc( + shape_nhwc, + shape_oihw8i32o4i, + kernel_size, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Conv2d TE wherein the input activation is defined by its + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. The physical packed layout used + for the activation is: nhw8h8wc + + """ + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) + + block_shape = get_block_shape() + block_H, block_W, _ = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape, packed_C=False) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, + kernel_size, + stride, + padding, + [1, 1], + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape, packed_C=False) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + packed_shape = get_packed_activation_layout(X_pad.shape, block_shape, packed_C=False) + X_packed = te.compute( + packed_shape, lambda n, ho, wo, hi, wi, c: X_pad[n, ho * block_H + hi, wo * block_W + wi, c] + ) + + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii + + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, hi, wi, k): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + # Construct split filter output channel index + ko = k // filter_Ki + ki = k % filter_Ki + + return te.sum( + X_packed[n, h_block_id, w_block_id, h_block_offset, w_block_offset, rc] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + n, ho, wo, hi, wi, k = s[Y].op.axis + rh, rw, rc = s[Y].op.reduce_axis + + rco, rci = s[Y].split(rc, factor=32) + s[Y].reorder(n, rco, wo, ho, k, hi, wi) + Xl = s.cache_read(X_packed, storage_scope, [Y]) + s[Xl].compute_at(s[Y], rco) + + ko, ki = s[Y].split(k, factor=32) + s[Y].reorder(n, rco, wo, ho, ko, hi, wi, ki) + Fl = s.cache_read(filt_packed, storage_scope, [Y]) + s[Fl].compute_at(s[Y], ko) + + binds = {} + if storage_scope and storage_scope != "global": + with 
tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt_packed, Y], binds) + + +class BaseConv2d: + batch = tvm.testing.parameter(1) + in_size = tvm.testing.parameter(8, 56) + in_channel = tvm.testing.parameter(64) + out_channel = tvm.testing.parameter(64) + kernel = tvm.testing.parameter(3) + stride = tvm.testing.parameter(1) + pad = tvm.testing.parameter(1) + dtype = tvm.testing.parameter("float32") + + +class TestConv2dLogical(BaseConv2d): + @tvm.testing.parametrize_targets("llvm") + def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target): + inputs = [ + np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), + np.random.uniform(0, 255, size=shape_oihw).astype(dtype), + ] + np_filter = inputs[1].transpose(2, 3, 1, 0) + ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) + output = build_and_run( + inputs, + conv2d_logical, + target, + target, + shape_nhwc=shape_nhwc, + shape_oihw=shape_oihw, + kernel_size=(kernel, kernel), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +class TestConv2dPackedFilter(BaseConv2d): + conv2d_impl = tvm.testing.parameter(conv2d_packed_filter, conv2d_packed_filter_nhwhwc) + + @tvm.testing.parametrize_targets("llvm") + def test_conv2d( + self, + conv2d_impl, + shape_nhwc, + shape_oihw, + shape_oihw8i32o4i, + kernel, + stride, + pad, + dtype, + target, + ): + inputs = [ + np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), + np.random.uniform(0, 255, size=shape_oihw8i32o4i).astype(dtype), + ] + np_filter = ( + inputs[1].transpose(0, 5, 1, 4, 6, 2, 3).reshape(shape_oihw).transpose(2, 3, 1, 0) + ) + ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) + output = build_and_run( + inputs, + conv2d_impl, + target, + target, + shape_nhwc=shape_nhwc, + shape_oihw8i32o4i=shape_oihw8i32o4i, + kernel_size=(kernel, kernel), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py new file mode 100644 index 000000000000..67af8d87f708 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
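+
+# NOTE (illustrative): the logical output extent of the pooling below is
+# (in - window + pad_before + pad_after) // stride + 1 per spatial axis.
+# For the parameters used in these tests (in_size=112, window=3, stride=2,
+# pad=1) that gives (112 - 3 + 1 + 1) // 2 + 1 = 56, which is then packed
+# into 8x8x32 blocks by get_packed_activation_layout().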
+
+import sys
+
+import tvm
+from tvm import te
+from tvm import topi
+from tvm.topi import testing
+from .infrastructure import (
+    ceildiv,
+    build_and_run,
+    get_block_shape,
+    get_packed_filter_layout,
+    get_packed_activation_layout,
+)
+
+import numpy as np
+import pytest
+
+# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c]
+def maxpool2d_logical(
+    shape_nhwc,
+    window_shape,
+    stride,
+    padding,
+    dtype,
+    storage_scope="global",
+):
+    """
+    Maxpool2d TE wherein the input activation is defined by its
+    logical NHWC shape. The packed physical layout for the
+    activation is nhwc8h8w32c.
+    """
+
+    block_shape = get_block_shape()
+    block_H, block_W, block_C = block_shape
+    shape = get_packed_activation_layout(shape_nhwc, block_shape)
+    logical_output_shape = (
+        shape_nhwc[0],
+        (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1,
+        (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[1] + 1,
+        shape_nhwc[3],
+    )
+    output_shape = get_packed_activation_layout(logical_output_shape, block_shape)
+
+    N, H, W, C = shape_nhwc
+    X = te.placeholder(shape_nhwc, dtype=dtype)
+
+    # Combination of the padding required by the maxpool operator and the padding
+    # needed to reach an evenly divisible number of blocks. Note that this padding
+    # should be inlined in the schedule so as to avoid input copying.
+    pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H
+    pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
+    X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
+
+    # Calculate packed layout
+    X_packed = te.compute(
+        shape,
+        lambda n, ho, wo, co, hi, wi, ci: X_pad[
+            n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci
+        ],
+    )
+
+    rh = te.reduce_axis((0, window_shape[0]), name="rh")
+    rw = te.reduce_axis((0, window_shape[1]), name="rw")
+
+    def compute(n, ho, wo, co, hi, wi, ci):
+        # Construct blockized strided maxpool height indices
+        h = ho * block_H + hi
+        h_contig = h * stride[0] + rh
+        h_block_id = h_contig // block_H
+        h_block_offset = h_contig % block_H
+
+        # Construct blockized strided maxpool width indices
+        w = wo * block_W + wi
+        w_contig = w * stride[1] + rw
+        w_block_id = w_contig // block_W
+        w_block_offset = w_contig % block_W
+
+        return te.max(
+            X_packed[n, h_block_id, w_block_id, co, h_block_offset, w_block_offset, ci],
+            axis=[rh, rw],
+        )
+
+    Y = te.compute(output_shape, compute)
+    s = te.create_schedule(Y.op)
+
+    # Ensure the padding and array packing is performed inline
+    s[X_pad].compute_inline()
+    s[X_packed].compute_inline()
+
+    binds = {}
+    if storage_scope and storage_scope != "global":
+        with tvm.transform.PassContext():
+            Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
+            Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope)
+            binds = {X: Xb, Y: Yb}
+
+    return (s, [X, Y], binds)
+
+
+class BaseMaxPooling:
+    batch = tvm.testing.parameter(1)
+    in_size = tvm.testing.parameter(8, 112)
+    in_channel = tvm.testing.parameter(64)
+    window_size = tvm.testing.parameter(3)
+    stride = tvm.testing.parameter(2)
+    pad = tvm.testing.parameter(1)
+    dtype = tvm.testing.parameter("float32")
+
+
+class TestMaxPooling(BaseMaxPooling):
+    @tvm.testing.parametrize_targets("llvm")
+    def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target):
+        inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)]
+        ref_output = testing.poolnd_python(
+            inputs[0],
+            (window_size, window_size),
+            strides=(stride, stride),
+            dilation=(1, 1),
dilation=(1, 1), + padding_before=(pad, pad), + padding_after=(pad, pad), + pool_type="max", + ) + output = build_and_run( + inputs, + maxpool2d_logical, + target, + target, + shape_nhwc, + window_shape=(window_size, window_size), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/contrib/test_popen_pool.py b/tests/python/contrib/test_popen_pool.py index 9ebe4c11c118..b3a91e176a32 100644 --- a/tests/python/contrib/test_popen_pool.py +++ b/tests/python/contrib/test_popen_pool.py @@ -27,6 +27,9 @@ call_py_ffi, call_cpp_ffi, call_cpp_py_ffi, + fast_summation, + slow_summation, + timeout_job, ) @@ -104,8 +107,41 @@ def test_popen_ffi(): assert proc.recv() == initargs[0] +def test_popen_pool_executor_async(): + pool = PopenPoolExecutor() + f1 = pool.submit(slow_summation, 9999999) + f2 = pool.submit(fast_summation, 9999999) + t1 = 0 + t2 = 0 + while True: + if t1 == 0 and f1.done(): + t1 = time.time() + if t2 == 0 and f2.done(): + t2 = time.time() + if t1 != 0 and t2 != 0: + break + assert t2 < t1, "Expected fast async job to finish first!" + assert f1.result() == f2.result() + + +def test_popen_pool_executor_timeout(): + timeout = 0.5 + + pool = PopenPoolExecutor(timeout=timeout) + + f1 = pool.submit(timeout_job, timeout) + while not f1.done(): + pass + try: + res = f1.result() + except Exception as ex: + assert isinstance(ex, TimeoutError) + + if __name__ == "__main__": test_popen_worker() test_popen_pool_executor() test_popen_initializer() test_popen_ffi() + test_popen_pool_executor_async() + test_popen_pool_executor_timeout() diff --git a/tests/python/contrib/test_tensorrt_int8_exp.py b/tests/python/contrib/test_tensorrt_int8_exp.py new file mode 100644 index 000000000000..84360e92d33b --- /dev/null +++ b/tests/python/contrib/test_tensorrt_int8_exp.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
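+
+# NOTE (illustrative): the int8 path in this test is driven by two
+# environment variables -- TVM_TENSORRT_USE_INT8 switches the TensorRT
+# codegen to int8, and TENSORRT_NUM_CALI_INT8 sets how many calibration
+# inferences are run before the engine is rebuilt for the measured run:
+#
+#     os.environ["TVM_TENSORRT_USE_INT8"] = "1"
+#     os.environ["TENSORRT_NUM_CALI_INT8"] = "10"  # 10 calibration runs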
+import pytest
+import os
+import numpy as np
+
+import tvm
+import tvm.contrib.graph_executor
+import tvm.relay.testing
+from tvm import relay
+from tvm.contrib.download import download_testdata
+from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt
+from tvm.relay.op.contrib import tensorrt
+
+
+def skip_codegen_test():
+    """Skip test if TensorRT and CUDA codegen are not present"""
+    if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist:
+        print("Skip because CUDA is not enabled.")
+        return True
+    if not tvm.get_global_func("relay.ext.tensorrt", True):
+        print("Skip because TensorRT codegen is not available.")
+        return True
+    return False
+
+
+def skip_runtime_test():
+    """Skip test if the TensorRT runtime or CUDA is not present"""
+    if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist:
+        print("Skip because CUDA is not enabled.")
+        return True
+    if not tensorrt.is_tensorrt_runtime_enabled():
+        print("Skip because TensorRT runtime is not available.")
+        return True
+    return False
+
+
+def test_trt_int8():
+    """
+    Compile a ResNet-34 model with the TensorRT int8 codegen and check that the
+    cosine distance between the original PyTorch output and the TVM int8 output
+    is small.
+    """
+    if skip_codegen_test() or skip_runtime_test():
+        return
+
+    try:
+        from PIL import Image
+        from scipy.spatial import distance
+    except ImportError:
+        print("please install the Pillow and scipy python packages")
+        return
+
+    try:
+        import torch
+        import torchvision
+        from torchvision import transforms
+    except ImportError:
+        print("please install the pytorch and torchvision python packages")
+        return
+
+    os.environ["TVM_TENSORRT_USE_INT8"] = "1"
+    os.environ["TENSORRT_NUM_CALI_INT8"] = "10"
+    model_name = "resnet34"
+    model = getattr(torchvision.models, model_name)(pretrained=True)
+    model = model.eval()
+
+    # We grab the TorchScripted model via tracing
+    input_shape = [1, 3, 224, 224]
+    input_data = torch.randn(input_shape)
+    scripted_model = torch.jit.trace(model, input_data).eval()
+
+    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
+    img_path = download_testdata(img_url, "cat.png", module="data")
+    img = Image.open(img_path).resize((224, 224))
+    my_preprocess = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        ]
+    )
+    img = my_preprocess(img)
+    img = np.expand_dims(img, 0)
+
+    input_name = "input0"
+    shape_list = [(input_name, img.shape)]
+    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
+
+    # compile the model
+    target = "cuda"
+    dev = tvm.cuda(0)
+    mod, config = partition_for_tensorrt(mod, params)
+    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
+        lib = relay.build(mod, target=target, params=params)
+
+    dtype = "float32"
+    gen_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    num_cali_int8 = int(os.environ["TENSORRT_NUM_CALI_INT8"])
+    if num_cali_int8 != 0:
+        print("start calibrating data ... ")
+        for i in range(num_cali_int8):
+            tvm_data = tvm.nd.array(img)
+            gen_module.set_input(input_name, tvm_data)
+            gen_module.run(data=tvm_data)
+        print("finished calibrating data ... ")
+
+    # get output of tvm model
+    print("rebuild engine and test to run ... ")
+    tvm_data = tvm.nd.array(img)
+    gen_module.set_input(input_name, tvm_data)
+    gen_module.run(data=tvm_data)
+    out = gen_module.get_output(0)
+
+    # check that the TVM output is close to the PyTorch output
+    torch_data = torch.from_numpy(img)
+    model = scripted_model.eval()
+    torch_output = model(torch_data)
+
+    # scipy's cosine distance expects 1-D vectors, so flatten both outputs
+    cosine_distance_res = distance.cosine(
+        out.numpy().flatten(), torch_output.detach().cpu().numpy().flatten()
+    )
+    assert cosine_distance_res <= 0.01
+
+    # Evaluate
+    print("Evaluate inference time cost...")
+    ftimer = gen_module.module.time_evaluator("run", dev, repeat=10, min_repeat_ms=500)
+    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
+    message = "Mean inference time (std dev): %.2f ms (%.2f ms)" % (
+        np.mean(prof_res),
+        np.std(prof_res),
+    )
+    print(message)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
index 2e16792542ca..fe063c2aafd1 100644
--- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
+++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
@@ -97,19 +97,6 @@ def test_bias_add():
     verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104")
 
 
-def test_relu():
-    """Test relu operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets"""
-
-    shape = (10, 10)
-    x = relay.var("x", shape=shape)
-    y = relay.nn.relu(x)
-    func = relay.Function([x], y)
-    mod = tvm.IRModule()
-    mod["main"] = func
-    verify_codegen(mod, dpu_target="DPUCADX8G")
-    verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104")
-
-
 def test_batchnorm():
     """Test batchnorm operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets"""
 
@@ -323,7 +310,6 @@ def expected():
     test_conv2d()
     test_depthwise_conv()
     test_bias_add()
-    test_relu()
     test_add()
     test_max_pool2d()
     test_global_max_pool2d()
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py
index 209c371a296a..d1e090f40bc5 100644
--- a/tests/python/driver/tvmc/conftest.py
+++ b/tests/python/driver/tvmc/conftest.py
@@ -189,3 +189,13 @@ def tflite_mobilenet_v1_0_25_128(tmpdir_factory):
     )
 
     return model_file
+
+
+@pytest.fixture(scope="session")
+def tflite_cnn_s_quantized(tmpdir_factory):
+    base_url = "https://github.com/ARM-software/ML-zoo/raw/master/models/keyword_spotting/cnn_small/tflite_int8/"
+    file_to_download = "cnn_s_quantized.tflite"
+    model_file = download_testdata(
+        "{}/{}".format(base_url, file_to_download), file_to_download, module=["tvmc"]
+    )
+    return model_file
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 16c02335c8a0..defd628c60c9 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -15,15 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 import os
+import re
 import shutil
+import tarfile
 from os import path
 from unittest import mock
 
 import pytest
 
 import tvm
+import tvm.testing
 
-from tvm.relay.op.contrib.ethosn import ethosn_available
 from tvm.contrib.target.vitis_ai import vitis_ai_available
 
 from tvm.driver import tvmc
@@ -290,10 +292,7 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128):
     assert os.path.exists(dumps_path)
 
 
-@pytest.mark.skipif(
-    not ethosn_available(),
-    reason="--target=ethos-n77 is not available.
TVM built with 'USE_ETHOSN OFF'", -) +@tvm.testing.requires_ethosn def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant) @@ -308,6 +307,37 @@ def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant assert os.path.exists(dumps_path) +def test_compile_tflite_module_with_external_codegen_cmsisnn( + tmpdir_factory, tflite_cnn_s_quantized +): + pytest.importorskip("tflite") + + output_dir = tmpdir_factory.mktemp("mlf") + tvmc_model = tvmc.load(tflite_cnn_s_quantized) + + output_file_name = f"{output_dir}/file.tar" + + tvmc_package = tvmc.compiler.compile_model( + tvmc_model, + target=f"cmsis-nn, c -runtime=c --system-lib --link-params -mcpu=cortex-m55 --executor=aot", + output_format="mlf", + package_path=output_file_name, + pass_context_configs=["tir.disable_vectorize=true"], + ) + + # check whether an MLF package was created + assert os.path.exists(output_file_name) + + # check whether the expected number of C sources are in the tarfile + with tarfile.open(output_file_name) as mlf_package: + c_source_files = [ + name + for name in mlf_package.getnames() + if re.match(r"\./codegen/host/src/\D+\d+\.c", name) + ] + assert len(c_source_files) == 3 + + @pytest.mark.skipif( not vitis_ai_available(), reason="--target=vitis-ai is not available. TVM built with 'USE_VITIS_AI OFF'", diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 7c66954c182d..a235394a9746 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -885,10 +885,12 @@ def add_noop_to_input_attr(attr_name, attr): x = np.random.randn(20, 10, 5).astype(np.float32) _test_slice_iteration_v1(x, x[0:3, 0:10], starts=(0, 0), ends=(3, 10), axes=(0, 1)) + _test_slice_iteration_v1(x, x[0:3, 0:10], starts=(0, 0), ends=(10, 3), axes=(1, 0)) _test_slice_iteration_v1(x, x[:, :, 3:4], starts=(0, 0, 3), ends=(20, 10, 4)) _test_slice_iteration_v1(x, x[:, 1:1000], starts=(1,), ends=(1000,), axes=(1,)) _test_slice_iteration_v1(x, x[:, 0:-1], starts=(0,), ends=(-1,), axes=(1,)) _test_slice_iteration_v10(x, x[0:3, 0:10], starts=(0, 0), ends=(3, 10), axes=(0, 1)) + _test_slice_iteration_v10(x, x[0:3, 0:10], starts=(0, 0), ends=(10, 3), axes=(1, 0)) _test_slice_iteration_v10(x, x[:, :, 3:4], starts=(0, 0, 3), ends=(20, 10, 4)) _test_slice_iteration_v10(x, x[:, 1:1000], starts=(1,), ends=(1000,), axes=(1,)) _test_slice_iteration_v10(x, x[:, 0:-1], starts=(0,), ends=(-1,), axes=(1,)) @@ -3323,6 +3325,8 @@ def verify_rnn( use_peep=False, linear_before_reset=False, directions=1, + rtol=1e-5, + atol=1e-5, target=None, dev=None, ): @@ -3445,7 +3449,7 @@ def register(name, shape, proto_type): model = helper.make_model(graph, producer_name="rnn_test") verify_with_ort_with_inputs( - model, input_values, output_shapes, atol=1e-2, rtol=1e-2, target=target, dev=dev + model, input_values, output_shapes, atol=atol, rtol=rtol, target=target, dev=dev ) @@ -3601,6 +3605,8 @@ def test_lstm(target, dev): @tvm.testing.parametrize_targets def test_gru(target, dev): + # Set seed for test reproduction + np.random.seed(137) for directions in [1, 2]: # No bias. verify_rnn( @@ -3611,10 +3617,12 @@ def test_gru(target, dev): use_bias=False, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) - # large batch. + # large batch. 
linear before reset verify_rnn( seq_length=4, batch_size=8, @@ -3636,6 +3644,8 @@ def test_gru(target, dev): use_bias=True, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -3648,6 +3658,8 @@ def test_gru(target, dev): use_bias=True, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -3660,6 +3672,8 @@ def test_gru(target, dev): use_bias=True, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -3672,6 +3686,8 @@ def test_gru(target, dev): use_bias=True, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -3687,6 +3703,8 @@ def test_gru(target, dev): activations=["HardSigmoid", "Softsign"] * directions, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -3702,6 +3720,8 @@ def test_gru(target, dev): betas=[0.3, 0.0] * directions, rnn_type="GRU", directions=directions, + rtol=1e-8, + atol=1e-8, target=target, dev=dev, ) @@ -3717,6 +3737,8 @@ def test_gru(target, dev): betas=[0.3, 0.1] * directions, rnn_type="GRU", directions=directions, + rtol=1e-8, + atol=1e-8, target=target, dev=dev, ) @@ -3731,6 +3753,8 @@ def test_gru(target, dev): use_initial_state=True, rnn_type="GRU", directions=directions, + rtol=1e-6, + atol=1e-6, target=target, dev=dev, ) @@ -4121,11 +4145,7 @@ def verify_nms( ) -# @tvm.testing.parametrize_targets -@pytest.mark.skip( - "Test regressed due to not being run in CI" - + " tracked here: https://github.com/apache/tvm/pull/8274" -) +@tvm.testing.parametrize_targets def test_loop(target, dev): def verify_cond_loop(): y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [1]) @@ -4687,10 +4707,6 @@ def verify_eyelike(indata): ) unsupported_onnx_tests = [ - "test_adagrad", - "test_adagrad_multiple", - "test_adam", - "test_adam_multiple", "test_cast_BFLOAT16_to_FLOAT", "test_cast_DOUBLE_to_FLOAT16", "test_cast_FLOAT_to_BFLOAT16", @@ -4715,16 +4731,8 @@ def verify_eyelike(indata): "test_dropout_default_mask", "test_dropout_default_mask_ratio", "test_dropout_default_ratio", - "test_einsum_batch_diagonal", - "test_einsum_batch_matmul", - "test_einsum_inner_prod", - "test_einsum_sum", - "test_einsum_transpose", "test_greater_equal", "test_greater_equal_bcast", - "test_hardmax_axis_0", - "test_hardmax_axis_1", - "test_hardmax_default_axis", "test_if_seq", "test_less_equal", "test_less_equal_bcast", @@ -4743,10 +4751,7 @@ def verify_eyelike(indata): "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", - "test_momentum", - "test_momentum_multiple", "test_mvn", - "test_nesterov_momentum", # When unsqueeze is fully supported, remaining nllloss tests should work: "test_nllloss_NC_expanded", "test_nllloss_NCd1_expanded", @@ -4766,16 +4771,6 @@ def verify_eyelike(indata): "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", - "test_pow_types_float", - "test_pow_types_float32_int32", - "test_pow_types_float32_int64", - "test_pow_types_float32_uint32", - "test_pow_types_float32_uint64", - "test_pow_types_int", - "test_pow_types_int32_float32", - "test_pow_types_int32_int32", - "test_pow_types_int64_float32", - "test_pow_types_int64_int64", "test_qlinearmatmul_2D", "test_qlinearmatmul_3D", "test_range_float_type_positive_delta_expanded", @@ -5019,6 +5014,81 @@ def 
verify_embedding_bag(num_embedding, embedding_dim, data_shape, num_bags=None verify_embedding_bag(32, 2, [3, 3]) +@tvm.testing.parametrize_targets +def test_index_put(target, dev): + class _index_put_model(torch.nn.Module): + def __init__(self, indices, values, accumulate): + super(_index_put_model, self).__init__() + self.indices = indices + self.values = values + self.accumulate = accumulate + + def forward(self, x): + return x.index_put(self.indices, self.values, self.accumulate) + + def _convert_to_onnx(model, dummy_data): + file_name = "{}.onnx".format("aten_model") + torch.onnx.export( + model, + dummy_data, + file_name, + export_params=True, + verbose=False, + opset_version=11, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + onnx_model = onnx.load(file_name) + return onnx_model + + def verify_index_put(data_shape, indices, accumulate): + dummy_data = torch.ones(data_shape) + tvm_inputs = [dummy_data.numpy()] + values = torch.rand(indices[0].size()) + model = _index_put_model(indices, values, accumulate) + onnx_model = _convert_to_onnx(model, dummy_data) + torch_out = model(dummy_data) + + tvm_out = get_tvm_output_with_vm( + onnx_model, tvm_inputs, target, dev, freeze_params=True, convert_to_static=True + ) + tvm.testing.assert_allclose(torch_out.numpy(), tvm_out) + + shape = (3, 5) + xidx = torch.tensor([0, 1, 2, 2]) + yidx = torch.tensor([0, 1, 3, 4]) + verify_index_put(shape, [xidx, yidx], True) + + shape = (3, 5, 3) + xidx = torch.tensor([0, 1, 2, 2, 0]) + yidx = torch.tensor([0, 1, 3, 4, 0]) + zidx = torch.tensor([0, 1, 1, 2, 0]) + verify_index_put(shape, [xidx, yidx, zidx], False) + + def verify_index_put_slice(data_shape, value_shape, accumulate): + dummy_data = torch.ones(data_shape) + tvm_inputs = [dummy_data.numpy()] + indices = [] + index_shape = [1] * len(value_shape) + index_shape[0] = -1 + for i in range(len(value_shape)): + indices.append(torch.arange(0, value_shape[i]).reshape(tuple(index_shape))) + index_shape.pop() + values = torch.rand(value_shape) + + model = _index_put_model(indices, values, accumulate) + onnx_model = _convert_to_onnx(model, dummy_data) + torch_out = model(dummy_data) + + tvm_out = get_tvm_output_with_vm( + onnx_model, tvm_inputs, target, dev, freeze_params=True, convert_to_static=True + ) + tvm.testing.assert_allclose(torch_out.numpy(), tvm_out) + + verify_index_put_slice((3, 3), (2, 2), False) + verify_index_put_slice((2, 3, 4), (1, 2, 3), True) + verify_index_put_slice((2, 3, 4, 5), (1, 2, 3, 1), False) + + @tvm.testing.parametrize_targets def test_reverse_sequence(target, dev): def verify_reverse_sequence(x, sequence_lens, batch_axis, time_axis): @@ -5240,6 +5310,39 @@ def repeat(N, D): ) +@tvm.testing.parametrize_targets +def test_qlinearconcat(target, dev): + def verify_qlinearconcat(shapes, out_shape, axis=None): + input_names = [] + input_values = [] + input_nodes = [] + for i in range(len(shapes)): + tensor_name = chr(ord("a") + i) + shape = shapes[i] + node = helper.make_tensor_value_info(tensor_name, TensorProto.FLOAT, list(shape)) + + input_names.append(tensor_name) + input_values.append(np.random.random(shape).astype("float32")) + input_nodes.append(node) + + node = helper.make_node("Concat", input_names, ["C"]) + if axis is not None: + axis_attr = helper.make_attribute("axis", axis) + node.attribute.append(axis_attr) + graph = helper.make_graph( + [node], + "qlinearconcat_test", + inputs=input_nodes, + outputs=[helper.make_tensor_value_info("C", TensorProto.FLOAT, list(out_shape))], + ) + model = 
helper.make_model(graph, producer_name="qlinearconcat_test") + quantize_and_verify_with_ort(model, input_names, shapes, target, dev) + + verify_qlinearconcat([[2, 1], [2, 1]], [4, 1], 0) + verify_qlinearconcat([[2, 1], [2, 1]], [2, 2], 1) + verify_qlinearconcat([[1, 2], [2, 2], [3, 2]], [6, 2], 0) + + @tvm.testing.parametrize_targets def test_qlinearadd(target, dev): def verify_qlinearadd(a_shape, b_shape, c_shape): @@ -5595,8 +5698,10 @@ def repeat(N, D): test_cumsum() test_wrong_input() test_aten() + test_index_put() test_reverse_sequence() test_eyelike() + test_qlinearconcat() test_qlinearconv() test_random_uniform() test_convinteger() diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index bae7c1b5498c..c27469edf1d7 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -1576,6 +1576,13 @@ class LinearNoBias(Module): def forward(self, input, weight): return F.linear(input, weight) + class LinearNested(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y, z): + return F.linear(x, F.linear(y, z)) + input2d = torch.rand([2, 2]).float() input3d = torch.rand([4, 3, 2]).float() weight1d = torch.rand([2]).float() @@ -1595,6 +1602,9 @@ def forward(self, input, weight): verify_model(LinearNoBias(), input_data=[input2d, weight1d]) # 3D input, 2D weight, no bias verify_model(LinearNoBias(), input_data=[input3d, weight3x2]) + + verify_model(LinearNested(), input_data=[torch.randn(10, 10) for _ in range(3)]) + # TODO: Add the following cases when matmul(1D, _) is supported by TVM # 1D input, 2D weight, 1D bias # 1D input, 2D weight, no bias @@ -3927,185 +3937,15 @@ def forward(self, x): verify_model(Flip(axis=-1), input_data=input) +def test_annotate_span(): + model = torchvision.models.resnet18().eval() + inp = torch.randn([1, 3, 224, 224]) + trace = torch.jit.trace(model, inp).eval() + mod, params = relay.frontend.from_pytorch( + trace, [("input", inp.shape)], use_parser_friendly_name=True + ) + relay.transform.AnnotateSpans()(mod) + + if __name__ == "__main__": - # some structural tests - test_forward_traced_function() - test_forward_dtypes() - test_weight_names() - test_duplicate_weight_use() - - # Single operator tests - test_forward_pixel_shuffle() - test_forward_add() - test_forward_subtract() - test_forward_multiply() - test_forward_matmul() - test_forward_rsub() - test_forward_onehot() - test_forward_embedding() - test_forward_reshape() - test_forward_reciprocal() - test_forward_repeat() - test_forward_repeat_interleave() - test_forward_squeeze() - test_forward_unsqueeze() - test_forward_concatenate() - test_forward_reduce_sum() - test_forward_reduce_prod() - test_forward_argmin() - test_forward_argmax() - test_forward_norm() - test_forward_frobenius_norm() - test_forward_std() - test_forward_variance() - test_forward_relu() - test_forward_prelu() - test_forward_leakyrelu() - test_forward_elu() - test_forward_celu() - test_forward_gelu() - test_forward_selu() - test_forward_log_sigmoid() - test_forward_adaptiveavgpool() - test_forward_maxpool2d() - test_forward_maxpool1d() - test_forward_maxpool3d() - test_forward_hardtanh() - test_forward_conv() - test_forward_conv_transpose() - test_forward_threshold() - test_forward_contiguous() - test_forward_batchnorm() - test_forward_instancenorm() - test_forward_layernorm() - test_forward_groupnorm() - test_forward_transpose() - test_forward_size() - test_forward_view() - test_forward_select() - 
test_forward_take() - test_forward_topk() - test_forward_where() - test_forward_addcdiv() - test_forward_addcmul() - test_forward_true_divide() - test_forward_is_floating_point() - test_forward_clone() - test_forward_softplus() - test_forward_softsign() - test_forward_logsoftmax() - test_forward_sigmoid() - test_forward_dense() - test_forward_linear() - test_forward_avgpool1d() - test_forward_avgpool2d() - test_forward_avgpool3d() - test_forward_dropout() - test_forward_slice() - test_forward_narrow() - test_forward_mean() - test_forward_expand() - test_forward_pow() - test_forward_unary() - test_forward_clamp() - test_forward_clamp_() - test_forward_logical_not() - test_forward_bitwise_not() - test_forward_bitwise_xor() - test_forward_logical_xor() - test_forward_isfinite() - test_forward_isnan() - test_forward_isinf() - test_forward_ones() - test_forward_ones_like() - test_forward_zeros() - test_forward_zeros_like() - test_forward_full() - test_forward_full_like() - test_forward_linspace() - test_forward_arange() - test_forward_mesh_grid() - test_forward_chunk() - test_forward_split() - test_forward_gather() - test_upsample() - test_forward_upsample3d() - test_forward_nms() - test_forward_roi_align() - test_to() - test_flatten() - test_type_as() - test_forward_functional_pad() - test_forward_zero_pad2d() - test_forward_constant_pad1d() - test_forward_constant_pad2d() - test_forward_constant_pad3d() - test_forward_reflection_pad1d() - test_forward_reflection_pad2d() - test_forward_replication_pad1d() - test_forward_replication_pad2d() - test_forward_replication_pad3d() - test_adaptive_pool3d() - test_conv3d() - test_conv3d_transpose() - test_forward_index() - test_min_max() - test_logsumexp() - test_stack() - test_stack_dynamic() - test_forward_unbind() - test_forward_nonzero() - test_forward_scatter() - test_forward_index_put() - test_numel() - test_bincount() - test_cumsum() - test_masked_fill() - test_transformer() - test_sort() - test_argsort() - test_logical_and() - test_masked_select() - test_unique() - test_hard_swish() - test_hard_sigmoid() - test_forward_nll_loss() - test_forward_flip() - - # Model tests - test_resnet18() - test_squeezenet1_0() - test_squeezenet1_1() - test_densenet121() - # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug - # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 - # test_inception_v3() - test_googlenet() - test_mnasnet0_5() - test_mobilenet_v2() - - test_custom_conversion_map() - - test_segmentation_models() - test_3d_models() - - # Quantization test - from qnn_test import test_quantized_imagenet, test_quantized_modules - - test_quantized_modules() - test_quantized_imagenet() - - # Test simple conditionals and loop - test_control_flow() - test_simple_rnn() - - # More complex recurrent models - from test_lstm import test_custom_lstm - - test_custom_lstm() - - # Test bert model - test_forward_pretrained_bert_base_uncased() - - # Test convert torch script(jit) with specific inputs' types - test_convert_torch_script_with_input_types() + pytest.main([__file__]) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index 939d0819546b..ca097734a9eb 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -14,10 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import pytest + import tvm -from tvm import te +from tvm import te, topi import numpy as np import tvm.testing +import tvm.topi.testing @tvm.testing.requires_gpu @@ -524,16 +527,73 @@ def check_target(device): check_target("rocm") +@tvm.testing.requires_gpu +def test_reduce_storage_reuse(): + target = tvm.target.Target("cuda") + + def run_passes(sch, args): + bounds = tvm.te.schedule.InferBound(sch) + stmt = tvm.te.schedule.ScheduleOps(sch, bounds) + func = tvm.te.schedule.SchedulePostProcToPrimFunc(args, stmt, None) + mod = tvm.IRModule.from_expr(func) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod) + return tvm.transform.Sequential( + [ + tvm.tir.transform.StorageFlatten(64), + tvm.tir.transform.Simplify(), + tvm.tir.transform.StorageRewrite(), + tvm.tir.transform.LowerThreadAllreduce(), + ] + )(mod) + + dev = tvm.device(target.kind.name, 0) + shape = (16, 16) + + A = te.placeholder(shape, dtype="float32", name="A") + B = topi.nn.softmax(A, axis=1) + 1.0 + + with tvm.target.Target(target): + s = topi.cuda.schedule_softmax(B) + + mod = run_passes(s, [A, B]) + + # Due to the storage rewrite pass, the reduction output storage reduce_temp0 can be reused as + # the storage of the next compute. + + # Example: + # ... + # tir.tvm_thread_allreduce((uint32)1, normal_reduce_temp0[0], 1, reduce_temp0, threadIdx.x) + # if ((threadIdx.x < 16)) { + # reduce_temp0[0] = (T_softmax_exp[threadIdx.x]/reduce_temp0[0]) + # } + # ... + + # The LowerThreadAllreduce pass should remap reduce_temp0 on the left hand side of the store + # above, as well as the load on the right hand side. + + # Expected output: + # ... + # red_buf0[0] = tir.tvm_warp_shuffle(mask[0], red_buf0[0], 0, 32, 32) + # if ((threadIdx.x < 16)) { + # red_buf0[0] = (T_softmax_exp[threadIdx.x]/red_buf0[0]) + # } + # ... 
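+    # A tvm.tir.Store whose destination buffer is still named "reduce_temp0"
+    # would mean the reused reduction storage was not remapped by
+    # LowerThreadAllreduce; the visitor below asserts no such store remains.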
+
+    def check_store_dst_remapped(op):
+        if isinstance(op, tvm.tir.Store):
+            assert op.buffer_var.name != "reduce_temp0"
+
+    tvm.tir.stmt_functor.post_order_visit(mod["main"].body, check_store_dst_remapped)
+
+    inp = np.random.uniform(size=shape).astype("float32")
+    ref = tvm.topi.testing.softmax_python(inp) + 1.0
+
+    f = tvm.build(s, [A, B], target)
+    a = tvm.nd.array(inp, dev)
+    b = tvm.nd.array(np.zeros(shape, dtype=B.dtype), dev)
+    f(a, b)
+    tvm.testing.assert_allclose(b.numpy(), ref, rtol=1e-5)
+
+
 if __name__ == "__main__":
-    test_rfactor_elemwise_threads()
-    test_rfactor_threads()
-    test_rfactor_factor_axis()
-    test_rfactor()
-    test_reduce_prims()
-    test_argmax()
-    test_rfactor_argmax()
-    test_warp_reduction1()
-    test_warp_reduction2()
-    test_init()
-    test_init_imm()
-    test_rfactor_init()
+    pytest.main([__file__])
diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
index e5ac85b115aa..490257ac66da 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -22,6 +22,7 @@
 import os
 import pathlib
 import platform
+import re
 import shutil
 import subprocess
 import tarfile
@@ -154,6 +155,10 @@ def parametrize_aot_options(test):
     skip_i386 = pytest.mark.skipif(
         platform.machine() == "i686", reason="Reference system unavailable in i386 container"
     )
+    requires_arm_eabi = pytest.mark.skipif(
+        shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable"
+    )
+
     interface_api = ["packed", "c"]
     use_unpacked_api = [True, False]
     test_runner = [AOT_DEFAULT_RUNNER, AOT_CORSTONE300_RUNNER]
@@ -177,7 +182,7 @@ def parametrize_aot_options(test):
 
     # Skip reference system tests if running in i386 container
     marked_combinations = map(
-        lambda parameters: pytest.param(*parameters, marks=skip_i386)
+        lambda parameters: pytest.param(*parameters, marks=[skip_i386, requires_arm_eabi])
         if parameters[2] == AOT_CORSTONE300_RUNNER
         else parameters,
         valid_combinations,
    )
@@ -250,7 +255,10 @@ def emit_main_prologue(main_file, custom_prologue, workspace_bytes):
 def emit_main_data(main_file, input_map, output_list, mod_name):
     for key in input_map:
-        main_file.write(f'#include "{mangle_name(mod_name,"input_data")}_{key}.h"\n')
+        sanitized_tensor_name = re.sub(r"\W", "_", key)
+        main_file.write(
+            f'#include "{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}.h"\n'
+        )
 
     for i in range(0, len(output_list)):
         main_file.write(f'#include "{mangle_name(mod_name,"expected_output_data")}{i}.h"\n')
@@ -262,7 +270,10 @@ def emit_main_data_structs(main_file, input_map, output_list, mod_name):
         f"struct {mangle_name(mod_name, 'inputs')} {mangle_name(mod_name, 'inputs')} = {{"
     )
     for key in input_map:
-        main_file.write(f"\t.{key} = {mangle_name(mod_name, 'input_data')}_{key},\n")
+        sanitized_tensor_name = re.sub(r"\W", "_", key)
+        main_file.write(
+            f"\t.{sanitized_tensor_name} = {mangle_name(mod_name, 'input_data')}_{sanitized_tensor_name},\n"
+        )
     main_file.write("};\n")
 
     main_file.write(
@@ -283,7 +294,8 @@ def emit_main_data_setup(main_file, input_map, output_list, mod_name):
 
     main_file.write(f'void* {mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ')
     for key in input_map:
-        main_file.write(f'{mangle_name(mod_name,"input_data")}_{key}, ')
+        sanitized_tensor_name = re.sub(r"\W", "_", key)
+        main_file.write(f'{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}, ')
     main_file.write("};\n")
 
     main_file.write(f'void* {mangle_name(mod_name,"outputs")}[{num_outputs}] = {{ ')
@@ -521,8 +533,9 @@ def compile_and_run(
         workspace_bytes += extract_main_workspace_size_bytes(base_path)
 
         for key in model.inputs:
+            sanitized_tensor_name = re.sub(r"\W", "_", key)
             create_header_file(
-                f'{mangle_name(model.name, "input_data")}_{key}',
+                f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                 model.inputs[key],
                 include_path,
             )
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 36cffefcd0bb..e117302d0ed8 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -503,5 +503,91 @@ def test_transpose(interface_api, use_unpacked_api, test_runner):
     )
 
 
+def test_name_sanitiser():
+    """Test that input tensors with special characters in their names don't break compilation"""
+
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_DEFAULT_RUNNER
+
+    func = relay.var("input-x::2", "float32")
+    ident = relay.Function([func], func)
+    one = np.array(1.0, "float32")
+    inputs = {"input-x::2": one}
+    output_list = generate_ref_data(ident, inputs)
+
+    compile_and_run(
+        AOTTestModel(module=IRModule.from_expr(func), inputs=inputs, outputs=output_list),
+        test_runner,
+        interface_api,
+        use_unpacked_api,
+        enable_op_fusion=False,
+    )
+
+
+def test_name_sanitiser_name_clash():
+    """Test that two input tensors whose names clash once sanitized generate an error"""
+
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_DEFAULT_RUNNER
+
+    dtype = "float32"
+    x = relay.var("input::-1", shape=(10, 5), dtype=dtype)
+    # Next 2 input tensor names will clash once sanitized.
+    y = relay.var("input::-2", shape=(10, 5), dtype=dtype)
+    t = relay.var("input:--2", shape=(), dtype=dtype)
+    a = relay.add(x, y)
+    b = relay.transpose(a)
+    z = relay.add(b, t)
+    # Check result.
+    func = relay.Function([x, y, t], z)
+    x_data = np.random.rand(10, 5).astype(dtype)
+    y_data = np.random.rand(10, 5).astype(dtype)
+    t_data = np.random.uniform(size=()).astype(dtype)
+
+    inputs = {"input::-1": x_data, "input::-2": y_data, "input:--2": t_data}
+    output_list = generate_ref_data(func, inputs)
+
+    with pytest.raises(ValueError, match="Sanitized input tensor name clash"):
+        compile_and_run(
+            AOTTestModel(module=IRModule.from_expr(func), inputs=inputs, outputs=output_list),
+            test_runner,
+            interface_api,
+            use_unpacked_api,
+            enable_op_fusion=False,
+        )
+
+
+@pytest.mark.parametrize(
+    "workspace_byte_alignment,main_workspace_size,sum_workspace_size",
+    [
+        (8, 10368, 15200),
+        (16, 10368, 15232),
+        (256, 10752, 17408),
+    ],
+)
+def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_workspace_size):
+    mod, params = tvm.relay.testing.synthetic.get_workload()
+
+    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}"
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        lib = tvm.relay.build(mod, target, params=params)
+
+    assert (
+        sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size
+    )
+    assert (
+        sum(
+            [
+                size
+                for metadata in lib.function_metadata.values()
+                for size in metadata.workspace_sizes.values()
+            ]
+        )
+        == sum_workspace_size
+    )
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/test_analysis_extract_fused_functions.py b/tests/python/relay/test_analysis_extract_fused_functions.py
index 5734f4e1a891..9317a4de7887 100644
--- a/tests/python/relay/test_analysis_extract_fused_functions.py
+++ b/tests/python/relay/test_analysis_extract_fused_functions.py
@@ -96,7 +96,7 @@ def is_conv_add(func):
 def test_extract_resnet():
     mod, _params = get_workload()
     items = relay.analysis.extract_fused_functions(mod)
-    assert len(items) == 6
+    assert len(items) == 7
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_analysis_extract_operators.py b/tests/python/relay/test_analysis_extract_operators.py
new file mode 100644
index 000000000000..5878b2a6e497
--- /dev/null
+++ b/tests/python/relay/test_analysis_extract_operators.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test operator frequency extraction"""
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.testing.resnet import get_workload
+from tvm.relay.testing import run_opt_pass
+
+
+def get_conv_net():
+    r"""This gets the net for:
+            conv2d
+           /    |
+          /     |
+    conv2d      |
+           \    |
+            \   |
+        elemwise add
+              |
+    """
+    dshape = (1, 1, 5, 1)
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+    x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1)
+
+    z = relay.add(y, x1)
+
+    return tvm.IRModule.from_expr(z)
+
+
+def get_conv2d():
+    x = relay.var("x", shape=(1, 56, 56, 64))
+    weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
+    y = relay.nn.conv2d(
+        x,
+        weight1,
+        channels=32,
+        kernel_size=(3, 3),
+        padding=(1, 1),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+    )
+    return tvm.IRModule.from_expr(y)
+
+
+def test_extract_identity():
+    mod = get_conv2d()
+    op_freqs = relay.analysis.list_op_freqs(mod)
+    assert len(op_freqs) == 1
+    assert op_freqs["nn.conv2d"] == 1
+
+
+def test_extract_conv_net():
+    mod = get_conv_net()
+    op_freqs = relay.analysis.list_op_freqs(mod)
+    assert len(op_freqs) == 2
+    assert op_freqs["add"] == 1
+    assert op_freqs["nn.conv2d"] == 2
+
+
+def test_extract_fused():
+    mod = get_conv_net()
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.FuseOps(3)(mod)
+
+    op_freqs = relay.analysis.list_op_freqs(mod)
+    assert len(op_freqs) == 2
+    assert op_freqs["add"] == 1
+    assert op_freqs["nn.conv2d"] == 2
+
+
+def test_extract_resnet():
+    mod, _params = get_workload()
+    expected_op_freqs = {
+        "nn.batch_norm": 19,
+        "nn.conv2d": 21,
+        "nn.relu": 18,
+        "nn.max_pool2d": 1,
+        "add": 8,
+        "nn.global_avg_pool2d": 1,
+        "nn.batch_flatten": 1,
+        "nn.dense": 1,
+        "nn.bias_add": 1,
+        "nn.softmax": 1,
+    }
+    op_freqs = relay.analysis.list_op_freqs(mod)
+    assert len(op_freqs) == len(expected_op_freqs)
+    assert all([op_freqs[op] == expected_op_freqs[op] for op in expected_op_freqs])
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 6430e6aa2116..decddc1ef0a4 100644
---
a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -24,8 +24,8 @@ from tvm.relay.loops import while_loop from tvm.relay.testing import run_infer_type as infer_type -from utils.assert_diagnostic import DiagnosticTesting from utils import ref_funcs +from utils.assert_diagnostic import DiagnosticTesting def int32(val): @@ -372,28 +372,51 @@ def test_any_shape_of(): check_result([data], mod, np.array(3).astype("int64")) -def verify_any_reduce( - reduce_op, data_shape, axis, exclude, keepdims, static_data_shape, ref_out_shape -): - mod = tvm.IRModule() - dtype = "bool" if reduce_op == relay.all else "float32" - data = relay.var("data", shape=data_shape, dtype=dtype) - y = reduce_op(data, axis, keepdims, exclude) - mod["main"] = relay.Function([data], y) - data_np = np.random.uniform(size=static_data_shape).astype(dtype) - check_result([data_np], mod, ref_out_shape, assert_shape=True) +class TestAnyReduce: + config = { + "argmax": (relay.argmax, any_dims(3), None, False, False, (3, 4, 5), ()), + "argmin": (relay.argmin, any_dims(4), 1, False, True, (3, 4, 5, 6), (3, 1, 5, 6)), + "all": (relay.all, any_dims(3), (1, 2), True, False, (3, 4, 5), (4, 5)), + "max": (relay.max, any_dims(4), -1, True, True, (3, 4, 5, 6), (1, 1, 1, 6)), + "min": (relay.min, any_dims(3), (0, 1), False, False, (4, 5, 6), (6,)), + "prod": (relay.prod, any_dims(4), 2, True, True, (3, 4, 5, 6), (1, 1, 5, 1)), + "mean": (relay.mean, any_dims(2), 0, False, False, (1, 2), (2,)), + "variance": (relay.variance, any_dims(5), (2, 4), False, False, (3, 4, 5, 6, 7), (3, 4, 6)), + } + ( + reduce_op, + data_shape, + axis, + exclude, + keepdims, + static_data_shape, + ref_out_shape, + ) = tvm.testing.parameters(*config.values(), ids=config.keys()) + + def test_any_reduce( + self, + target, + dev, + reduce_op, + data_shape, + axis, + exclude, + keepdims, + static_data_shape, + ref_out_shape, + ): + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and reduce_op == relay.all: + pytest.xfail("Known failing test case for vulkan runtime") -@tvm.testing.uses_gpu -def test_any_reduce(): - verify_any_reduce(relay.argmax, any_dims(3), None, False, False, (3, 4, 5), ()) - verify_any_reduce(relay.argmin, any_dims(4), 1, False, True, (3, 4, 5, 6), (3, 1, 5, 6)) - verify_any_reduce(relay.all, any_dims(3), (1, 2), True, False, (3, 4, 5), (4, 5)) - verify_any_reduce(relay.max, any_dims(4), -1, True, True, (3, 4, 5, 6), (1, 1, 1, 6)) - verify_any_reduce(relay.min, any_dims(3), (0, 1), False, False, (4, 5, 6), (6,)) - verify_any_reduce(relay.prod, any_dims(4), 2, True, True, (3, 4, 5, 6), (1, 1, 5, 1)) - verify_any_reduce(relay.mean, any_dims(2), 0, False, False, (1, 2), (2,)) - verify_any_reduce(relay.variance, any_dims(5), (2, 4), False, False, (3, 4, 5, 6, 7), (3, 4, 6)) + mod = tvm.IRModule() + dtype = "bool" if reduce_op == relay.all else "float32" + data = relay.var("data", shape=data_shape, dtype=dtype) + y = reduce_op(data, axis, keepdims, exclude) + mod["main"] = relay.Function([data], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + check_result([data_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)]) def verify_any_layout_transform( @@ -579,66 +602,58 @@ def test_any_conv2d(): ) -def verify_any_conv2d_NCHWc( - data_shape, - kernel_shape, - strides, - padding, - dilation, - data_layout, - kernel_layout, - out_layout, - static_data_shape, - ref_out_shape, -): - mod = tvm.IRModule() - dtype = "float32" - data = relay.var("data", shape=data_shape, dtype=dtype) - 
kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype) - y = relay.nn.contrib_conv2d_nchwc( - data, - kernel, +class TestAnyConv2dNCHWc: + data_shape = tvm.testing.parameter((relay.Any(), 8, 224, 224, 8)) + kernel_shape = tvm.testing.parameter((8, 8, 3, 3, 8, 8)) + strides = tvm.testing.parameter((1, 1)) + padding = tvm.testing.parameter((1, 1)) + data_layout = tvm.testing.parameter("NCHW8c") + kernel_layout = tvm.testing.parameter("OIHW8i8o") + out_layout = tvm.testing.parameter("NCHW8c") + + dilation, static_data_shape, ref_out_shape = tvm.testing.parameters( + ((1, 1), (1, 8, 224, 224, 8), (1, 8, 224, 224, 8)), + ((2, 2), (2, 8, 224, 224, 8), (2, 8, 222, 222, 8)), + ) + + @tvm.testing.known_failing_targets("cuda", "vulkan") + def test_any_conv2d_NCHWc( + self, + target, + dev, + data_shape, + kernel_shape, strides, padding, dilation, - kernel_size=kernel_shape[2:4], - channels=kernel_shape[0] * kernel_shape[-1], - data_layout=data_layout, - kernel_layout=kernel_layout, - out_layout=out_layout, - ) - mod["main"] = relay.Function([data, kernel], y) - data_np = np.random.uniform(size=static_data_shape).astype(dtype) - kernel_np = np.random.uniform(size=kernel_shape).astype(dtype) - check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True) - - -# TODO(@kevinthesun): Support dynamic input height and width. -def test_any_conv2d_NCHWc(): - verify_any_conv2d_NCHWc( - (relay.Any(), 8, 224, 224, 8), - (8, 8, 3, 3, 8, 8), - (1, 1), - (1, 1), - (1, 1), - "NCHW8c", - "OIHW8i8o", - "NCHW8c", - (1, 8, 224, 224, 8), - (1, 8, 224, 224, 8), - ) - verify_any_conv2d_NCHWc( - (relay.Any(), 8, 224, 224, 8), - (8, 8, 3, 3, 8, 8), - (1, 1), - (1, 1), - (2, 2), - "NCHW8c", - "OIHW8i8o", - "NCHW8c", - (2, 8, 224, 224, 8), - (2, 8, 222, 222, 8), - ) + data_layout, + kernel_layout, + out_layout, + static_data_shape, + ref_out_shape, + ): + mod = tvm.IRModule() + dtype = "float32" + data = relay.var("data", shape=data_shape, dtype=dtype) + kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype) + y = relay.nn.contrib_conv2d_nchwc( + data, + kernel, + strides, + padding, + dilation, + kernel_size=kernel_shape[2:4], + channels=kernel_shape[0] * kernel_shape[-1], + data_layout=data_layout, + kernel_layout=kernel_layout, + out_layout=out_layout, + ) + mod["main"] = relay.Function([data, kernel], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + kernel_np = np.random.uniform(size=kernel_shape).astype(dtype) + check_result( + [data_np, kernel_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)] + ) def verify_any_conv1d_transpose_ncw( @@ -867,6 +882,8 @@ def test_any_split(): verify_any_split((relay.Any(), relay.Any()), 2, 1, (9, 4), [(9, 2), (9, 2)]) verify_any_split((relay.Any(), 12), (1, 4, 8), 1, (7, 12), [(7, 1), (7, 3), (7, 4)]) verify_any_split((relay.Any(), relay.Any()), (1, 4, 8), 1, (7, 12), [(7, 1), (7, 3), (7, 4)]) + verify_any_split((relay.Any(), 12), (8,), 1, (7, 12), [(7, 8), (7, 4)]) + verify_any_split((relay.Any(), relay.Any()), (8,), 1, (7, 12), [(7, 8), (7, 4)]) @tvm.testing.uses_gpu @@ -881,136 +898,150 @@ def test_any_batch_flatten(): check_result([data_np], mod, ref_out_shape, assert_shape=True) -def verify_any_dense( - data_shape, - weight_shape, - units, - static_data_shape, - static_weight_shape, - ref_out_shape, - use_cublas=False, -): - mod = tvm.IRModule() - dtype = "float32" - data = relay.var("data", shape=data_shape, dtype=dtype) - weight = relay.var("weight", shape=weight_shape, dtype=dtype) - y = relay.nn.dense(data, 
weight, units) - mod["main"] = relay.Function([data, weight], y) - data_np = np.random.uniform(size=static_data_shape).astype(dtype) - weight_np = np.random.uniform(size=static_weight_shape).astype(dtype) - - targets = None - if use_cublas and tvm.get_global_func("tvm.contrib.cublas.matmul", True): - targets = [("cuda -libs=cublas", tvm.cuda(0))] - - check_result([data_np, weight_np], mod, ref_out_shape, assert_shape=True, targets=targets) +# TODO(tvm-team) Fix dense schedule +@tvm.testing.known_failing_targets("cuda", "vulkan") +class TestAnyDense: + ( + data_shape, + weight_shape, + units, + static_data_shape, + static_weight_shape, + ref_out_shape, + ) = tvm.testing.parameters( + (any_dims(2), any_dims(2), None, (4, 16), (8, 16), (4, 8)), + (any_dims(2), (50, relay.Any()), 50, (4, 40), (50, 40), (4, 50)), + ) + @tvm.testing.known_failing_targets("cuda", "vulkan") + def test_any_dense( + self, + target, + dev, + data_shape, + weight_shape, + units, + static_data_shape, + static_weight_shape, + ref_out_shape, + ): + mod = tvm.IRModule() + dtype = "float32" + data = relay.var("data", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=weight_shape, dtype=dtype) + y = relay.nn.dense(data, weight, units) + mod["main"] = relay.Function([data, weight], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + weight_np = np.random.uniform(size=static_weight_shape).astype(dtype) + + check_result( + [data_np, weight_np], mod, ref_out_shape, assert_shape=True, targets=[(target, dev)] + ) -# TODO(tvm-team) Fix dense schedule -# @tvm.testing.uses_gpu -def test_any_dense(): - verify_any_dense(any_dims(2), any_dims(2), None, (4, 16), (8, 16), (4, 8)) - verify_any_dense(any_dims(2), (50, relay.Any()), 50, (4, 40), (50, 40), (4, 50)) + @tvm.testing.parametrize_targets("cuda -libs=cublas") + @tvm.testing.known_failing_targets("cuda", "vulkan") + def test_any_dense_cublas( + self, + target, + dev, + data_shape, + weight_shape, + units, + static_data_shape, + static_weight_shape, + ref_out_shape, + ): + self.test_any_dense( + target, + dev, + data_shape, + weight_shape, + units, + static_data_shape, + static_weight_shape, + ref_out_shape, + ) -@tvm.testing.uses_gpu -def test_any_dense_dynamic_batch(): - verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50)) - verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50), use_cublas=True) - - -def verify_any_batch_matmul( - x_shape, - y_shape, - out_shape, - x_var_shape, - y_var_shape, - dtype="float32", - trans_x=False, - trans_y=True, -): - x = relay.var("x", relay.TensorType(x_var_shape, dtype)) - y = relay.var("y", relay.TensorType(y_var_shape, dtype)) - z = relay.nn.batch_matmul(x, y, transpose_a=trans_x, transpose_b=trans_y) - - func = relay.Function([x, y], z) - x_np = np.random.uniform(size=x_shape).astype(dtype) - y_np = np.random.uniform(size=y_shape).astype(dtype) - z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=trans_x, trans_y=trans_y) - - for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - z = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_np, y_np - ) - tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5) +class TestAnyBatchMatmul: + dtype = tvm.testing.parameter("float32") + executor_kind = tvm.testing.parameter("vm", "debug") -# TODO(mbrookhart): enable once VM supports heterogenous execution -# @tvm.testing.uses_gpu -def test_any_batch_matmul(): - 
verify_any_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16), (1, 16, 32), (relay.Any(),) * 3) - verify_any_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16), (5, 16, 32), (relay.Any(),) * 3) - verify_any_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20), (5, 16, 32), (relay.Any(),) * 3) - verify_any_batch_matmul( - (30, 16, 32), (30, 20, 32), (30, 16, 20), (30, 16, 32), (relay.Any(),) * 3 + (x_shape, y_shape) = tvm.testing.parameters( + ((1, 16, 32), (1, 32, 16)), + ((5, 16, 32), (5, 32, 16)), + ((5, 16, 32), (5, 32, 20)), + ((30, 16, 32), (30, 32, 20)), ) - verify_any_batch_matmul( - (1, 16, 32), (1, 16, 32), (1, 16, 16), (relay.Any(), 16, 32), (relay.Any(), 16, 32) - ) - verify_any_batch_matmul( - (5, 16, 32), (5, 16, 32), (5, 16, 16), (relay.Any(), 16, 32), (relay.Any(), 16, 32) - ) - verify_any_batch_matmul( - (5, 16, 32), (5, 20, 32), (5, 16, 20), (relay.Any(), 16, 32), (relay.Any(), 20, 32) - ) - verify_any_batch_matmul( - (30, 16, 32), (30, 20, 32), (30, 16, 20), (relay.Any(), 16, 32), (relay.Any(), 20, 32) - ) + # any_x = tvm.testing.parameter("none", "batch") + # any_y = tvm.testing.parameter("none", "batch", "all") - verify_any_batch_matmul( - (1, 32, 16), (1, 16, 32), (1, 16, 16), (1, 32, 16), (relay.Any(),) * 3, trans_x=True - ) - verify_any_batch_matmul( - (5, 16, 32), (5, 32, 16), (5, 16, 16), (5, 16, 32), (relay.Any(),) * 3, trans_y=False - ) - verify_any_batch_matmul( - (5, 32, 16), - (5, 32, 20), - (5, 16, 20), - (5, 32, 16), - (relay.Any(),) * 3, - trans_x=True, - trans_y=False, - ) - verify_any_batch_matmul( - (1, 32, 16), - (1, 16, 32), - (1, 16, 16), - (relay.Any(), 32, 16), - (relay.Any(), 16, 32), - trans_x=True, - ) - verify_any_batch_matmul( - (5, 16, 32), - (5, 32, 16), - (5, 16, 16), - (relay.Any(), 16, 32), - (relay.Any(), 32, 16), - trans_y=False, - ) - verify_any_batch_matmul( - (5, 32, 16), - (5, 32, 20), - (5, 16, 20), - (relay.Any(), 32, 16), - (relay.Any(), 32, 20), - trans_x=True, - trans_y=False, + any_x, any_y = tvm.testing.parameters( + ("none", "batch"), ("none", "all"), ("batch", "none"), ("batch", "batch"), ("batch", "all") ) + transpose_x = tvm.testing.parameter(True, False) + transpose_y = tvm.testing.parameter(True, False) + + @tvm.testing.fixture + def x_var_shape(self, x_shape, any_x): + if any_x == "none": + return x_shape + elif any_x == "batch": + return tuple(relay.Any() if i == 0 else size for i, size in enumerate(x_shape)) + elif any_x == "all": + return tuple(relay.Any() for _ in x_shape) + + @tvm.testing.fixture + def y_var_shape(self, y_shape, any_y): + if any_y == "none": + return y_shape + elif any_y == "batch": + return tuple(relay.Any() if i == 0 else size for i, size in enumerate(y_shape)) + elif any_y == "all": + return tuple(relay.Any() for _ in y_shape) + + @tvm.testing.known_failing_targets("cuda", "vulkan") + def test_any_batch_matmul( + self, + target, + dev, + x_shape, + y_shape, + any_x, + any_y, + x_var_shape, + y_var_shape, + transpose_x, + transpose_y, + executor_kind, + dtype, + ): + if transpose_x: + x_shape = (x_shape[0], x_shape[2], x_shape[1]) + x_var_shape = (x_var_shape[0], x_var_shape[2], x_var_shape[1]) + + if transpose_y: + y_shape = (y_shape[0], y_shape[2], y_shape[1]) + y_var_shape = (y_var_shape[0], y_var_shape[2], y_var_shape[1]) + + x = relay.var("x", relay.TensorType(x_var_shape, dtype)) + y = relay.var("y", relay.TensorType(y_var_shape, dtype)) + z = relay.nn.batch_matmul(x, y, transpose_a=transpose_x, transpose_b=transpose_y) + + func = relay.Function([x, y], z) + x_np = 
np.random.uniform(size=x_shape).astype(dtype) + y_np = np.random.uniform(size=y_shape).astype(dtype) + z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=transpose_x, trans_y=transpose_y) + + mod = tvm.ir.IRModule.from_expr(func) + z = relay.create_executor(executor_kind, mod=mod, device=dev, target=target).evaluate()( + x_np, y_np + ) + tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5) + @tvm.testing.uses_gpu def verify_any_pad(data_shape, pad_width, static_data_shape): @@ -1991,7 +2022,7 @@ def test_gather_nd(): def verify_gather_nd(data_shape, indices_shape, data_shape_np, indices_shape_np, batch_dims=0): x = relay.var("x", relay.TensorType(data_shape, "float32")) y = relay.var("y", relay.TensorType(indices_shape, "int32")) - z = relay.gather_nd(x, y, batch_dims, indices_shape[0]) + z = relay.gather_nd(x, y, batch_dims=batch_dims, index_rank=indices_shape[0]) mod = tvm.IRModule() mod["main"] = relay.Function([x, y], z) diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index b90bce548a5e..092cae01f568 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -194,6 +194,8 @@ def get_func(shape): engine.dump() +# Note: Once compile engine is removed, we should keep this test so that +# we make sure that opt_level=0 passes are being called correctly. def test_compile_placeholder_bypass(): engine = relay.backend.compile_engine.get() x = relay.var("x", shape=(2, 3)) diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 30db5facc208..ad5f2aa9d4fa 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -16,7 +16,6 @@ # under the License. 
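The gather_nd change above now passes batch_dims and index_rank as keyword arguments; index_rank exists to give shape inference the number of index components when the indices shape is dynamic. As a reminder of the batch_dims=0 semantics, a small NumPy sketch (gather_nd_ref is a hypothetical reference helper written for illustration, not TVM API):

    import numpy as np

    def gather_nd_ref(data, indices):
        # indices: (M, a_0, ..., a_{k-1}); output: (a_0, ..., a_{k-1}) + data.shape[M:]
        m = indices.shape[0]
        flat = indices.reshape(m, -1)
        picked = [data[tuple(flat[:, i])] for i in range(flat.shape[1])]
        return np.stack(picked).reshape(indices.shape[1:] + data.shape[m:])

    data = np.arange(6).reshape(2, 3)
    idx = np.array([[0, 1], [1, 0]])  # selects data[0, 1] and data[1, 0]
    assert (gather_nd_ref(data, idx) == np.array([1, 3])).all()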
"""Unit tests for graph partitioning.""" -import os import sys from collections import OrderedDict import numpy as np @@ -24,86 +23,17 @@ import tvm from tvm import relay, runtime -from tvm.contrib import utils from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.annotation import compiler_begin, compiler_end +from utils.external_codegen import ( + update_lib, + set_external_func_attr, + parametrize_external_codegen_checks, + parametrize_external_json_codegen_checks, +) -def update_lib(lib): - test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - source_dir = os.path.join(test_dir, "..", "..", "..") - contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") - - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] - tmp_path = utils.tempdir() - lib_name = "lib.so" - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.runtime.load_module(lib_path) - - return lib - - -def check_vm_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()): - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - exe = relay.vm.compile(mod, target=target) - code, lib = exe.save() - lib = update_lib(lib) - exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, device) - out = vm.run(**map_inputs) - tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol) - - -def check_graph_executor_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu() -): - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - json, lib, _ = relay.build(mod, target=target) - lib = update_lib(lib) - rt_mod = tvm.contrib.graph_executor.create(json, lib, device) - - for name, data in map_inputs.items(): - rt_mod.set_input(name, data) - rt_mod.run() - out = tvm.nd.empty(out_shape, device=device) - out = rt_mod.get_output(0, out) - - tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol) - - -def check_aot_executor_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu() -): - if tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON": - pytest.skip("MicroTVM support not enabled. Set USE_MICRO=ON in config.cmake to enable.") - - # Late import to avoid breaking test with USE_MICRO=OFF. 
- from aot.aot_test_utils import AOTTestModel, AOT_DEFAULT_RUNNER, compile_and_run - - interface_api = "packed" - use_unpacked_api = False - test_runner = AOT_DEFAULT_RUNNER - compile_and_run( - AOTTestModel(module=mod, inputs=map_inputs, outputs=[result]), - test_runner, - interface_api, - use_unpacked_api, - ) - - -def set_external_func_attr(func, compiler, ext_symbol): - func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) - func = func.with_attr("Compiler", compiler) - func = func.with_attr("global_symbol", ext_symbol) - return func - - -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") -@pytest.mark.parametrize( - "check_result", [check_vm_result, check_graph_executor_result, check_aot_executor_result] -) +@parametrize_external_codegen_checks def test_multi_node_subgraph(check_result): x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -170,10 +100,7 @@ def test_multi_node_subgraph(check_result): ) -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") -@pytest.mark.parametrize( - "check_result", [check_vm_result, check_graph_executor_result, check_aot_executor_result] -) +@parametrize_external_codegen_checks def test_extern_gcc_single_op(check_result): x = relay.var("x", shape=(8, 8)) y = relay.var("y", shape=(8, 8)) @@ -191,10 +118,7 @@ def test_extern_gcc_single_op(check_result): check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data) -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") -@pytest.mark.parametrize( - "check_result", [check_vm_result, check_graph_executor_result, check_aot_executor_result] -) +@parametrize_external_codegen_checks def test_extern_gcc_single_op_int(check_result): x = relay.var("x", shape=(8, 8), dtype="int32") y = relay.var("y", shape=(8, 8), dtype="int32") @@ -212,10 +136,7 @@ def test_extern_gcc_single_op_int(check_result): check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data) -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") -@pytest.mark.parametrize( - "check_result", [check_vm_result, check_graph_executor_result, check_aot_executor_result] -) +@parametrize_external_codegen_checks def test_extern_gcc(check_result): x = relay.var("x", shape=(2, 2)) y = relay.var("y", shape=(2, 2)) @@ -292,12 +213,11 @@ def constant_updater(expr, symbol): tvm._ffi.registry.remove_global_func("relay.ext.ccompiler.constant_updater") -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") @pytest.mark.skipif( not tvm.get_global_func("relay.ext.dnnl", True), reason="skip because DNNL codegen is not available", ) -@pytest.mark.parametrize("check_result", [check_vm_result, check_graph_executor_result]) +@parametrize_external_json_codegen_checks def test_extern_dnnl(check_result): dtype = "float32" ishape = (1, 32, 14, 14) @@ -335,12 +255,11 @@ def test_extern_dnnl(check_result): ) -@pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") @pytest.mark.skipif( not tvm.get_global_func("relay.ext.dnnl", True), reason="skip because DNNL codegen is not available", ) -@pytest.mark.parametrize("check_result", [check_vm_result, check_graph_executor_result]) +@parametrize_external_json_codegen_checks def test_extern_dnnl_const(check_result): dtype = "float32" ishape = (1, 32, 14, 14) diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index 11099ffe50ee..bab709f2b88d 100644 --- 
a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -14,15 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import sys + import numpy as np import pytest import tvm -from tvm import te -from tvm import relay +import tvm.testing + +from tvm import te, relay from tvm.relay.testing import check_grad, run_infer_type from tvm.relay.transform import gradient -import tvm.testing def sigmoid(x): @@ -36,131 +38,179 @@ def relu(x): return x_copy -@tvm.testing.uses_gpu -def test_unary_op(): - def check_single_op(opfunc, ref, dtype): - shape = (10, 4) +class TestUnaryOp: + config = { + "log": (tvm.relay.log, lambda x, g: g * (1 / x)), + "exp": (tvm.relay.exp, lambda x, g: g * np.exp(x)), + "sigmoid": (tvm.relay.sigmoid, lambda x, g: g * sigmoid(x) * (1 - sigmoid(x))), + "tanh": (tvm.relay.tanh, lambda x, g: g * (1 - np.tanh(x) * np.tanh(x))), + "sqrt": (tvm.relay.sqrt, lambda x, g: g * 0.5 * np.power(x, -0.5)), + "abs": (tvm.relay.abs, lambda x, g: np.where(x < 0, -g, g)), + "relu": (relay.nn.relu, lambda x, g: np.where(x < 0, np.zeros_like(x), g)), + "erf": (tvm.relay.erf, lambda x, g: g * (2.0 / (np.pi ** (0.5)) * np.exp(-x * x))), + "cos": (tvm.relay.cos, lambda x, g: g * -1.0 * np.sin(x)), + "sin": (tvm.relay.sin, lambda x, g: g * np.cos(x)), + "tan": (tvm.relay.tan, lambda x, g: g * (1.0 / (np.cos(x) ** 2))), + "atan": (tvm.relay.atan, lambda x, g: g * (1 / (1 + np.power(x, 2.0)))), + "log2": (tvm.relay.log2, lambda x, g: g * (1 / (np.log(2) * x))), + "log10": (tvm.relay.log10, lambda x, g: g * (1 / (np.log(10) * x))), + "cosh": (tvm.relay.cosh, lambda x, g: g * (np.sinh(x))), + "sinh": (tvm.relay.sinh, lambda x, g: g * (np.cosh(x))), + "asin": (tvm.relay.asin, lambda x, g: g * (1.0 / (1.0 - x ** 2) ** (1.0 / 2.0))), + "acos": (tvm.relay.acos, lambda x, g: g * (-1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0))), + "acosh": (tvm.relay.acosh, lambda x, g: g * (1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0))), + "asinh": (tvm.relay.asinh, lambda x, g: g * (1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0))), + "atanh": (tvm.relay.atanh, lambda x, g: g * (-1.0 / (x ** 2 - 1.0))), + } + + relay_op, ref_func = tvm.testing.parameters(*config.values(), ids=config.keys()) + dtype = tvm.testing.parameter("float32", "float64") + shape = tvm.testing.parameter((10, 4)) + + def test_op(self, target, dev, relay_op, ref_func, shape, dtype): + + target = tvm.target.Target(target) + if target.kind.name == "vulkan": + + known_breaks = { + "float32": [ + tvm.relay.erf, + tvm.relay.tan, + tvm.relay.atan, + tvm.relay.log10, + tvm.relay.cosh, + tvm.relay.sinh, + tvm.relay.asin, + tvm.relay.acos, + tvm.relay.acosh, + tvm.relay.asinh, + tvm.relay.atanh, + ], + "float64": [ + tvm.relay.log, + tvm.relay.exp, + tvm.relay.sigmoid, + tvm.relay.tanh, + tvm.relay.sqrt, + tvm.relay.erf, + tvm.relay.cos, + tvm.relay.sin, + tvm.relay.tan, + tvm.relay.atan, + tvm.relay.log2, + tvm.relay.log10, + tvm.relay.cosh, + tvm.relay.sinh, + tvm.relay.asin, + tvm.relay.acos, + tvm.relay.acosh, + tvm.relay.asinh, + tvm.relay.atanh, + ], + } + + if relay_op in known_breaks[dtype]: + pytest.xfail(f"{dtype} {relay_op.__name__} not yet supported on Vulkan runtime") + tp = relay.TensorType(shape, dtype) x = relay.var("x", tp) g = relay.var("g", tp) - y = opfunc(x) * g - - if ref is not None: - data = np.random.rand(*shape).astype(dtype) - grad_in = np.random.rand(*shape).astype(dtype) - ref_grad = ref(data, grad_in) - fwd_func = 
relay.Function([x, g], y) - fwd_func = run_infer_type(fwd_func) - bwd_func = run_infer_type(gradient(fwd_func)) - - for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad, _) = relay.create_executor(device=dev, target=target).evaluate( - bwd_func - )(data, grad_in) - np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01) - - for opfunc, ref in [ - (tvm.relay.log, lambda x, g: g * (1 / x)), - (tvm.relay.exp, lambda x, g: g * np.exp(x)), - (tvm.relay.sigmoid, lambda x, g: g * sigmoid(x) * (1 - sigmoid(x))), - (tvm.relay.tanh, lambda x, g: g * (1 - np.tanh(x) * np.tanh(x))), - (tvm.relay.sqrt, lambda x, g: g * 0.5 * np.power(x, -0.5)), - (tvm.relay.abs, lambda x, g: np.where(x < 0, -g, g)), - (relay.nn.relu, lambda x, g: np.where(x < 0, np.zeros_like(x), g)), - (tvm.relay.erf, lambda x, g: g * (2.0 / (np.pi ** (0.5)) * np.exp(-x * x))), - (tvm.relay.cos, lambda x, g: g * -1.0 * np.sin(x)), - (tvm.relay.sin, lambda x, g: g * np.cos(x)), - (tvm.relay.tan, lambda x, g: g * (1.0 / (np.cos(x) ** 2))), - (tvm.relay.atan, lambda x, g: g * (1 / (1 + np.power(x, 2.0)))), - (tvm.relay.log2, lambda x, g: g * (1 / (np.log(2) * x))), - (tvm.relay.log10, lambda x, g: g * (1 / (np.log(10) * x))), - (tvm.relay.cosh, lambda x, g: g * (np.sinh(x))), - (tvm.relay.sinh, lambda x, g: g * (np.cosh(x))), - (tvm.relay.asin, lambda x, g: g * (1.0 / (1.0 - x ** 2) ** (1.0 / 2.0))), - (tvm.relay.acos, lambda x, g: g * (-1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0))), - (tvm.relay.acosh, lambda x, g: g * (1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0))), - (tvm.relay.asinh, lambda x, g: g * (1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0))), - (tvm.relay.atanh, lambda x, g: g * (-1.0 / (x ** 2 - 1.0))), - ]: - for dtype in ("float32", "float64"): - check_single_op(opfunc, ref, dtype) - - -@tvm.testing.uses_gpu -def test_binary_op(): - def inst(vars, sh): - return [vars.get(s, s) for s in sh] - - def check_binary_op(opfunc, ref, dtype): - s = (5, 10, 5) - t = relay.TensorType((5, 10, 5), dtype=dtype) + y = relay_op(x) * g + + fwd_func = relay.Function([x, g], y) + fwd_func = run_infer_type(fwd_func) + bwd_func = run_infer_type(gradient(fwd_func)) + + data_in = np.random.rand(*shape).astype(dtype) + grad_in = np.random.rand(*shape).astype(dtype) + ref_grad_out = ref_func(data_in, grad_in) + + op_res, (op_grad, _) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)( + data_in, grad_in + ) + np.testing.assert_allclose(op_grad.numpy(), ref_grad_out, rtol=0.01) + + +class TestBinaryOp: + config = { + "add": (relay.add, lambda x, y: [np.ones_like(x), np.ones_like(y)]), + "subtract": (relay.subtract, lambda x, y: [np.ones_like(x), -np.ones_like(y)]), + "multiply": (relay.multiply, lambda x, y: [y, x]), + "divide": (relay.divide, lambda x, y: [1 / y, -x / (y ** 2)]), + } + + relay_op, ref_func = tvm.testing.parameters(*config.values(), ids=config.keys()) + dtype = tvm.testing.parameter("float32", "float64") + shape = tvm.testing.parameter((5, 10, 5)) + + def test_binary_op(self, target, dev, relay_op, ref_func, shape, dtype): + t = relay.TensorType(shape, dtype=dtype) x = relay.var("x", t) y = relay.var("y", t) - z = opfunc(x, y) + z = relay_op(x, y) - x_data = np.random.rand(*s).astype(t.dtype) - y_data = np.random.rand(*s).astype(t.dtype) - ref_grad0, ref_grad1 = ref(x_data, y_data) + x_data = np.random.rand(*shape).astype(t.dtype) + y_data = np.random.rand(*shape).astype(t.dtype) + ref_grad0, ref_grad1 = ref_func(x_data, y_data) fwd_func = relay.Function([x, y], z) fwd_func = run_infer_type(fwd_func) bwd_func = 
run_infer_type(gradient(fwd_func)) - for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad0, op_grad1) = relay.create_executor( - device=dev, target=target - ).evaluate(bwd_func)(x_data, y_data) - np.testing.assert_allclose(op_grad0.numpy(), ref_grad0, rtol=0.01) - np.testing.assert_allclose(op_grad1.numpy(), ref_grad1, rtol=0.01) + op_res, (op_grad0, op_grad1) = relay.create_executor(device=dev, target=target).evaluate( + bwd_func + )(x_data, y_data) + np.testing.assert_allclose(op_grad0.numpy(), ref_grad0, rtol=0.01) + np.testing.assert_allclose(op_grad1.numpy(), ref_grad1, rtol=0.01) - for opfunc, ref in [ - (relay.add, lambda x, y: [np.ones_like(x), np.ones_like(y)]), - (relay.subtract, lambda x, y: [np.ones_like(x), -np.ones_like(y)]), - (relay.multiply, lambda x, y: [y, x]), - (relay.divide, lambda x, y: [1 / y, -x / (y ** 2)]), - ]: - for dtype in ("float32", "float64"): - check_binary_op(opfunc, ref, dtype) +def test_softmax_grad(target, dev): + target = tvm.target.Target(target) + if target.kind.name == "vulkan": + pytest.xfail("Known failure on vulkan") -def test_softmax_grad(): data = relay.var("data", relay.TensorType((1, 16), "float64")) fwd_func = relay.Function([data], relay.nn.softmax(data)) - check_grad(fwd_func, scale=1) + check_grad(fwd_func, scale=1, target_devices=[(target, dev)]) -def test_log_softmax_grad(): +def test_log_softmax_grad(target, dev): + target = tvm.target.Target(target) + if target.kind.name == "vulkan": + pytest.xfail("Known failure on vulkan") + data = relay.var("data", relay.TensorType((2, 16), "float64")) fwd_func = relay.Function([data], relay.nn.log_softmax(data)) - check_grad(fwd_func, scale=1) - + check_grad(fwd_func, scale=1, target_devices=[(target, dev)]) -def verify_bias_add(d_shape, b_shape, axis=1): - data = relay.var("data", relay.TensorType(d_shape, "float32")) - bias = relay.var("bias", relay.TensorType(b_shape, "float32")) - fwd_func = relay.Function([data, bias], relay.nn.bias_add(data, bias, axis=axis)) - check_grad(fwd_func) +class TestBiasAddGrad: + d_shape, b_shape, axis = tvm.testing.parameters( + ((1, 16), (16,), 1), + ((1, 8, 2, 2), (8,), 1), + ((1, 2, 2, 8), (8,), 3), + ((4, 8), (8,), 1), + ) -def test_bias_add_grad(): - verify_bias_add((1, 16), (16,)) - verify_bias_add((1, 8, 2, 2), (8,)) - verify_bias_add((1, 2, 2, 8), (8,), 3) - verify_bias_add((4, 8), (8,)) + def test_bias_add(self, target, dev, d_shape, b_shape, axis): + data = relay.var("data", relay.TensorType(d_shape, "float32")) + bias = relay.var("bias", relay.TensorType(b_shape, "float32")) + fwd_func = relay.Function([data, bias], relay.nn.bias_add(data, bias, axis=axis)) + check_grad(fwd_func, target_devices=[(target, dev)]) -def test_expand_dims_grad(): +def test_expand_dims_grad(target, dev): data = relay.var("data", shape=(2, 3), dtype="float64") fwd_func = relay.Function([data], relay.expand_dims(data, axis=1, num_newaxis=2)) - check_grad(fwd_func) + check_grad(fwd_func, target_devices=[(target, dev)]) -def test_concatenate_grad(): +def test_concatenate_grad(target, dev): x = relay.var("x", shape=(2, 2, 5)) y = relay.var("y", shape=(2, 1, 5)) z = relay.var("z", shape=(2, 4, 5)) fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1)) - check_grad(fwd_func) + check_grad(fwd_func, target_devices=[(target, dev)]) if __name__ == "__main__": - pytest.main([__file__]) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/test_op_grad_level10.py b/tests/python/relay/test_op_grad_level10.py index 8d961eb60b18..4c2c9082e044 
100644 --- a/tests/python/relay/test_op_grad_level10.py +++ b/tests/python/relay/test_op_grad_level10.py @@ -14,35 +14,52 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import pytest +import sys + import numpy as np +import pytest + +import tvm +import tvm.testing from tvm import relay from tvm.relay.testing import check_grad -def test_cross_entropy_grad(): - for dtype in ("float32", "float64"): - x = relay.var("x", shape=(2, 5), dtype=dtype) - y = relay.var("y", shape=(2, 5), dtype=dtype) - check_grad( - relay.Function([x, y], relay.op.nn.cross_entropy(x, y)), eps=0.01, scale=0.1, mean=1 - ) +index_dtype = tvm.testing.parameter("int32", "int64") +val_dtype = tvm.testing.parameter("float32", "float64") -def test_cross_entropy_with_logits_grad(): - for dtype in ("float32", "float64"): - x = relay.var("x", shape=(2, 5), dtype=dtype) - y = relay.var("y", shape=(2, 5), dtype=dtype) - check_grad( - relay.Function([x, y], relay.op.nn.cross_entropy_with_logits(x, y)), - eps=0.01, - scale=0.1, - mean=1, - ) +def test_cross_entropy_grad(target, dev, val_dtype): + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and val_dtype == "float64": + # GLSL.std.450's Log implementation only takes 16/32-bit floats. + pytest.xfail("Known failing test case for vulkan runtime") + + x = relay.var("x", shape=(2, 5), dtype=val_dtype) + y = relay.var("y", shape=(2, 5), dtype=val_dtype) + check_grad( + relay.Function([x, y], relay.op.nn.cross_entropy(x, y)), + eps=0.01, + scale=0.1, + mean=1, + target_devices=[(target, dev)], + ) + + +def test_cross_entropy_with_logits_grad(target, dev, val_dtype): + x = relay.var("x", shape=(2, 5), dtype=val_dtype) + y = relay.var("y", shape=(2, 5), dtype=val_dtype) + check_grad( + relay.Function([x, y], relay.op.nn.cross_entropy_with_logits(x, y)), + eps=0.01, + scale=0.1, + mean=1, + target_devices=[(target, dev)], + ) -def test_checkpoint(): +def test_checkpoint(target, dev): inputs = [relay.var("x{}".format(i), shape=(1,)) for i in range(4)] output = relay.multiply(relay.add(inputs[0], inputs[1]), relay.add(inputs[2], inputs[3])) check_grad(relay.Function(inputs, relay.annotation.checkpoint(output))) @@ -59,56 +76,59 @@ def test_checkpoint(): ) ) out_single = scope.get() - check_grad(relay.Function(inputs, out_single)) + check_grad(relay.Function(inputs, out_single), target_devices=[(target, dev)]) -def verify_batch_matmul_grad(a_shape, b_shape, transpose_a, transpose_b): - tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) - tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32")) - check_grad( - relay.Function( - [tensor_a, tensor_b], - relay.op.nn.batch_matmul( - tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b - ), - ) +class TestBatchMatmulGrad: + a_shape, b_shape, transpose_a, transpose_b = tvm.testing.parameters( + ((2, 3, 5), (2, 5, 4), False, False), + ((2, 3, 5), (2, 4, 5), False, True), + ((2, 5, 3), (2, 5, 4), True, False), + ((2, 5, 3), (2, 4, 5), True, True), ) - -def test_batch_matmul_grad(): - verify_batch_matmul_grad((2, 3, 5), (2, 5, 4), False, False) - verify_batch_matmul_grad((2, 3, 5), (2, 4, 5), False, True) - verify_batch_matmul_grad((2, 5, 3), (2, 5, 4), True, False) - verify_batch_matmul_grad((2, 5, 3), (2, 4, 5), True, True) + def test_batch_matmul_grad(self, target, dev, a_shape, b_shape, transpose_a, transpose_b): + tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) 
+        tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32"))
+        check_grad(
+            relay.Function(
+                [tensor_a, tensor_b],
+                relay.op.nn.batch_matmul(
+                    tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b
+                ),
+            ),
+            target_devices=[(target, dev)],
+        )
 
 
-def test_reverse_reshape_grad():
+def test_reverse_reshape_grad(target, dev):
     x = relay.var("x", shape=(3, 4, 5), dtype="float64")
-    check_grad(relay.Function([x], relay.op.reverse_reshape(x, (-1, 0))))
+    check_grad(
+        relay.Function([x], relay.op.reverse_reshape(x, (-1, 0))),
+        target_devices=[(target, dev)],
+    )
 
 
-def test_one_hot_grad():
+def test_one_hot_grad(target, dev, index_dtype, val_dtype):
     indices_shape = (3, 4)
     depth = 5
     axis = -1
-    for indices_dtype in ["int32", "int64"]:
-        for val_dtype in ["float32", "float64"]:
-            inputs = [
-                np.random.randint(depth, size=indices_shape, dtype=indices_dtype),
-                np.array(np.random.randn() * 1e-5).astype(val_dtype),
-                np.array(np.random.randn() * 1e-5).astype(val_dtype),
-            ]
-            test_inputs = inputs[1:]
+    inputs = [
+        np.random.randint(depth, size=indices_shape, dtype=index_dtype),
+        np.array(np.random.randn() * 1e-5).astype(val_dtype),
+        np.array(np.random.randn() * 1e-5).astype(val_dtype),
+    ]
+    test_inputs = inputs[1:]
 
-            indices = relay.var("indices", shape=indices_shape, dtype=indices_dtype)
-            on_val = relay.var("on_val", shape=tuple(), dtype=val_dtype)
-            off_val = relay.var("off_val", shape=tuple(), dtype=val_dtype)
-            y = relay.one_hot(indices, on_val, off_val, depth, axis, val_dtype)
-            f = relay.Function([indices, on_val, off_val], y)
+    indices = relay.var("indices", shape=indices_shape, dtype=index_dtype)
+    on_val = relay.var("on_val", shape=tuple(), dtype=val_dtype)
+    off_val = relay.var("off_val", shape=tuple(), dtype=val_dtype)
+    y = relay.one_hot(indices, on_val, off_val, depth, axis, val_dtype)
+    f = relay.Function([indices, on_val, off_val], y)
 
-            check_grad(f, inputs=inputs, test_inputs=test_inputs)
+    check_grad(f, inputs=inputs, test_inputs=test_inputs, target_devices=[(target, dev)])
 
 
 if __name__ == "__main__":
-    pytest.main([__file__])
+    sys.exit(pytest.main(sys.argv))
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 97e10eb25a95..eff3919460c2 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -43,54 +43,67 @@ def rsqrt(x):
     return one / np.sqrt(x)
 
 
-@tvm.testing.uses_gpu
-def test_unary_op():
-    def check_single_op(opfunc, ref, dtype):
+class TestUnaryOp:
+    op_list = {
+        "log": (tvm.relay.log, np.log),
+        "exp": (tvm.relay.exp, np.exp),
+        "erf": (tvm.relay.erf, scipy.special.erf),
+        "sqrt": (tvm.relay.sqrt, np.sqrt),
+        "rsqrt": (tvm.relay.rsqrt, rsqrt),
+        "sigmoid": (tvm.relay.sigmoid, sigmoid),
+        "tanh": (tvm.relay.tanh, np.tanh),
+        "relu": (relay.nn.relu, relu),
+        "cos": (tvm.relay.cos, np.cos),
+        "sin": (tvm.relay.sin, np.sin),
+        "tan": (tvm.relay.tan, np.tan),
+        "atan": (tvm.relay.atan, np.arctan),
+    }
+
+    dtype = tvm.testing.parameter("float16", "float32")
+
+    relay_op, ref_func = tvm.testing.parameters(*op_list.values(), ids=op_list.keys())
+
+    def test_unary_op(self, target, dev, relay_op, ref_func, dtype):
+        target = tvm.target.Target(target)
+        if (
+            dtype == "float16"
+            and target.kind.name == "cuda"
+            and not have_fp16(tvm.cuda(0).compute_version)
+        ):
+            pytest.xfail("No float16 support on local cuda device")
+        elif (
+            dtype == "float16"
+            and target.kind.name == "vulkan"
+            and not target.attrs.get("supports_float16", False)
+        ):
+            pytest.xfail("No float16 support on vulkan target")
+
+        if target.kind.name == "vulkan" and relay_op in [
+            tvm.relay.erf,
+            tvm.relay.tan,
+            tvm.relay.atan,
+        ]:
+            pytest.xfail(f"Vulkan runtime doesn't yet support {relay_op}")
+
         shape = (10, 4)
         dtype = dtype
         tp = relay.TensorType(shape)
         x = relay.var("x", tp, dtype=dtype)
-        y = opfunc(x)
+        y = relay_op(x)
         # test printer
         assert ("{}(%x)".format(y.op.name)) in y.astext()
         # test type inference
         yy = run_infer_type(y)
         assert yy.checked_type == tp
-        if ref is not None:
+        if ref_func is not None:
             data = np.random.rand(*shape).astype(dtype)
-            ref_res = ref(data)
+            ref_res = ref_func(data)
             func = relay.Function([x], y)
-            for target, dev in tvm.testing.enabled_targets():
-                # use graph by execuor default for testing, as we need
-                # create function explicitly to avoid constant-folding.
-                if (
-                    dtype == "float16"
-                    and target == "cuda"
-                    and not have_fp16(tvm.cuda(0).compute_version)
-                ):
-                    continue
-                op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                    data
-                )
-                np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
-
-    for opfunc, ref in [
-        (tvm.relay.log, np.log),
-        (tvm.relay.exp, np.exp),
-        (tvm.relay.erf, scipy.special.erf),
-        (tvm.relay.sqrt, np.sqrt),
-        (tvm.relay.rsqrt, rsqrt),
-        (tvm.relay.sigmoid, sigmoid),
-        (tvm.relay.tanh, np.tanh),
-        (relay.nn.relu, relu),
-        (tvm.relay.cos, np.cos),
-        (tvm.relay.sin, np.sin),
-        (tvm.relay.tan, np.tan),
-        (tvm.relay.atan, np.arctan),
-    ]:
-        for dtype in ["float16", "float32"]:
-            check_single_op(opfunc, ref, dtype)
+            # use the graph executor by default for testing, as we need to
+            # create the function explicitly to avoid constant-folding.
+            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
+            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 87cdc41570d0..44f211dd9f8a 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -16,10 +16,15 @@
 # under the License.
 """ Support level2 operator test cases.
""" +import sys + import numpy as np +import pytest + import tvm import tvm.testing import tvm.topi.testing + from tvm import autotvm, relay, te from tvm.contrib import utils from tvm.relay import transform @@ -191,174 +196,213 @@ def test_conv2d_infer_type(): assert yy.checked_type == relay.TensorType((n, h, w, 16), "int32") -@tvm.testing.uses_gpu -def test_conv2d_run(): - def run_test_conv2d( +class TestConv2D: + config = { + "group1": dict( + dtype="float32", + out_dtype="float32", + scale=1, + dshape=(1, 32, 18, 18), + kshape=(32, 4, 3, 3), + padding=(1, 1), + channels=32, + groups=8, + kernel_size=(3, 3), + dilation=(1, 1), + ), + "group2": dict( + dtype="float32", + out_dtype="float32", + scale=1, + dshape=(1, 32, 18, 18), + kshape=(64, 1, 3, 3), + padding=(1, 1), + channels=64, + groups=32, + kernel_size=(3, 3), + dilation=(1, 1), + ), + "normal": dict( + dtype="float32", + out_dtype="float32", + scale=1, + dshape=(1, 3, 224, 224), + kshape=(10, 3, 3, 3), + padding=(1, 1), + channels=10, + groups=1, + kernel_size=(3, 3), + dilation=(1, 1), + ), + "mixed_precision_int8_int32_case1": dict( + dtype="int8", + out_dtype="int32", + scale=1, + dshape=(1, 3, 224, 224), + kshape=(10, 3, 3, 3), + padding=(1, 1), + channels=10, + groups=1, + kernel_size=(3, 3), + dilation=(1, 1), + ), + "mixed_precision_int8_int32_case2": dict( + dtype="int8", + out_dtype="int32", + scale=1, + dshape=(1, 3, 224, 224), + kshape=(10, 3, 1, 3), + padding=(0, 1), + channels=10, + groups=1, + kernel_size=(1, 3), + dilation=(1, 1), + ), + "dilated": dict( + dtype="float32", + out_dtype="float32", + scale=1, + dshape=(1, 3, 18, 18), + kshape=(10, 3, 3, 3), + padding=(1, 1), + channels=10, + groups=1, + kernel_size=(3, 3), + dilation=(3, 3), + ), + } + + # TODO(Lunderberg): Make a cleaner utility for this type of + # parametrization. It would be much nicer to have the fixture + # name come from the dictionaries themselves, rather than needing + # to be re-packed into tuples. 
+ ( dtype, out_dtype, scale, dshape, kshape, - padding=(1, 1), - fref=None, - groups=1, - dilation=(1, 1), - except_targets=None, - **attrs, + padding, + channels, + groups, + kernel_size, + dilation, + ) = tvm.testing.parameters( + *[ + [ + d[p] + for p in [ + "dtype", + "out_dtype", + "scale", + "dshape", + "kshape", + "padding", + "channels", + "groups", + "kernel_size", + "dilation", + ] + ] + for d in config.values() + ], + ids=config.keys(), + ) + + def test_run( + self, + target, + dev, + dtype, + out_dtype, + scale, + dshape, + kshape, + padding, + groups, + dilation, + channels, + kernel_size, ): - if except_targets is None: - except_targets = [] + target = tvm.target.Target(target) x = relay.var("x", shape=dshape, dtype=dtype) w = relay.var("w", shape=kshape, dtype=dtype) - y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs) + y = relay.nn.conv2d( + x, + w, + padding=padding, + dilation=dilation, + groups=groups, + channels=channels, + kernel_size=kernel_size, + ) func = relay.Function([x, w], y) - data = np.random.uniform(-scale, scale, size=dshape).astype(dtype) + kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype) dkernel = tvm.topi.testing.dilate_python(kernel, (1, 1) + dilation) - if fref is None: - ref_res = tvm.topi.testing.conv2d_nchw_python( - data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding, groups=groups - ) - else: - ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - - for target, dev in tvm.testing.enabled_targets(): - if target in except_targets: - continue - dev = tvm.device(target, 0) - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - data, kernel - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-4, atol=1e-4) - def compile_test_conv2d_arm_cpu( - dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), groups=1, dilation=(1, 1), **attrs - ): - x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", shape=kshape, dtype=dtype) - y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs) - func = relay.Function([x, w], y) - mod = tvm.IRModule() - mod["main"] = func + data = np.random.uniform(-scale, scale, size=dshape).astype(dtype) + ref_res = tvm.topi.testing.conv2d_nchw_python( + data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding, groups=groups + ) - test_schedule = '{"i": ["llvm -device=arm_cpu", "depthwise_conv2d_nchw_spatial_pack.arm_cpu", \ - [["TENSOR", [1, 512, 32, 32], "float32"], \ - ["TENSOR", [512, 1, 3, 3], "float32"], \ - [1, 1], [1, 1], [1, 1], "float32"], {}, \ - ["depthwise_conv2d_nchw_spatial_pack.arm_cpu", [1, 512, 32, 32, "float32"], \ - [512, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], \ - {"i": 743640, "t": "", "c": null, \ - "e": [["tile_co", "sp", [32, 16]], ["tile_oh", "sp", [8, 1]], \ - ["tile_ow", "sp", [1, 8]], \ - ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], \ - ["reorder_1", "re", [0, 1, 2, 3, 6, 4, 5]], \ - ["ann_reduce", "an", ["unroll", "none"]], \ - ["ann_spatial", "an", ["unroll", "unroll", "vec"]], \ - ["data_pad_inline", "ot", 4], ["data_vec_inline", "ot", 1], \ - ["conv_inline", "ot", 0]]}], "r": [[0.0002933163], \ - 0, 3.1976189613342285, 1570811630.6058347], "v": 0.1}' - temp = utils.tempdir() - with open(temp.relpath("temp.log"), "w") as log_file: - log_file.write(test_schedule) - with autotvm.apply_history_best(temp.relpath("temp.log")): - with tvm.transform.PassContext(opt_level=3): - print("Compiling...") - graph_json, mod, 
params = tvm.relay.build(mod, target="llvm -device=arm_cpu") + op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + data, kernel + ) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-4, atol=1e-4) - # depthwise conv2d - dshape = (1, 32, 18, 18) - kshape = (32, 1, 3, 3) - run_test_conv2d( - "float32", - "float32", - 1, - dshape, - kshape, - padding=(1, 1), - channels=32, - groups=32, - kernel_size=(3, 3), - fref=lambda x, w: tvm.topi.testing.depthwise_conv2d_python_nchw(x, w, (1, 1), "SAME"), - ) - # depthwise conv2d for arm_cpu +def test_compile_depthwise_conv2d_arm_cpu(): + dtype = "float32" + out_dtype = "float32" + scale = 1 dshape = (1, 512, 32, 32) kshape = (512, 1, 3, 3) - compile_test_conv2d_arm_cpu( - "float32", - "float32", - 1, - dshape, - kshape, - padding=(1, 1), - channels=512, - groups=512, - kernel_size=(3, 3), - ) - - # CUDA is disabled for 'direct' schedule: - # https://github.com/apache/tvm/pull/3070#issuecomment-486597553 - # group conv2d - dshape = (1, 32, 18, 18) - kshape = (32, 4, 3, 3) - run_test_conv2d( - "float32", - "float32", - 1, - dshape, - kshape, - padding=(1, 1), - channels=32, - groups=8, - kernel_size=(3, 3), - except_targets=["cuda"], - ) - # also group conv2d - dshape = (1, 32, 18, 18) - kshape = (64, 1, 3, 3) - run_test_conv2d( - "float32", - "float32", - 1, - dshape, - kshape, - padding=(1, 1), - channels=64, - groups=32, - kernel_size=(3, 3), - except_targets=["cuda"], - ) - - # normal conv2d - dshape = (1, 3, 224, 224) - kshape = (10, 3, 3, 3) - run_test_conv2d( - "float32", "float32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3, 3) - ) - # mixed precision - run_test_conv2d( - "int8", "int32", 1, dshape, kshape, padding=(1, 1), channels=10, kernel_size=(3, 3) - ) - kshape = (10, 3, 1, 3) - # mixed precision. 
- run_test_conv2d( - "int8", "int32", 1, dshape, kshape, padding=(0, 1), channels=10, kernel_size=(1, 3) - ) - # dilated conv2d - dshape = (1, 3, 18, 18) - kshape = (10, 3, 3, 3) - run_test_conv2d( - "float32", - "float32", - 1, - dshape, - kshape, - padding=(1, 1), - channels=10, - kernel_size=(3, 3), - dilation=(3, 3), + padding = (1, 1) + channels = 512 + groups = 512 + kernel_size = (3, 3) + dilation = (1, 1) + + x = relay.var("x", shape=dshape, dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) + y = relay.nn.conv2d( + x, + w, + padding=padding, + dilation=dilation, + groups=groups, + channels=channels, + kernel_size=kernel_size, ) + func = relay.Function([x, w], y) + mod = tvm.IRModule() + mod["main"] = func + + test_schedule = '{"i": ["llvm -device=arm_cpu", "depthwise_conv2d_nchw_spatial_pack.arm_cpu", \ + [["TENSOR", [1, 512, 32, 32], "float32"], \ + ["TENSOR", [512, 1, 3, 3], "float32"], \ + [1, 1], [1, 1], [1, 1], "float32"], {}, \ + ["depthwise_conv2d_nchw_spatial_pack.arm_cpu", [1, 512, 32, 32, "float32"], \ + [512, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], \ + {"i": 743640, "t": "", "c": null, \ + "e": [["tile_co", "sp", [32, 16]], ["tile_oh", "sp", [8, 1]], \ + ["tile_ow", "sp", [1, 8]], \ + ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], \ + ["reorder_1", "re", [0, 1, 2, 3, 6, 4, 5]], \ + ["ann_reduce", "an", ["unroll", "none"]], \ + ["ann_spatial", "an", ["unroll", "unroll", "vec"]], \ + ["data_pad_inline", "ot", 4], ["data_vec_inline", "ot", 1], \ + ["conv_inline", "ot", 0]]}], "r": [[0.0002933163], \ + 0, 3.1976189613342285, 1570811630.6058347], "v": 0.1}' + temp = utils.tempdir() + with open(temp.relpath("temp.log"), "w") as log_file: + log_file.write(test_schedule) + with autotvm.apply_history_best(temp.relpath("temp.log")): + with tvm.transform.PassContext(opt_level=3): + print("Compiling...") + graph_json, mod, params = tvm.relay.build(mod, target="llvm -device=arm_cpu") @tvm.testing.uses_gpu @@ -1588,7 +1632,7 @@ def _compile(ic, oc, target, data_layout, kernel_layout, dtypes): return assembly def _has_fast_int8_instructions(asm, target): - if "skylake-avx512" in target: + if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target: return "pmaddubs" in asm elif "cascadelake" in target: return "vpdpbusd" in asm @@ -1598,8 +1642,13 @@ def _has_fast_int8_instructions(asm, target): # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout. # Re-enable this after adding conv2d_NCHWc_int8 support for NHWC. - # compile conv2d for x86 (skylake, cascadelake) and test assembly contains *pmadd* instructions - targets = ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"] + # compile conv2d for x86 (SSE3/AVX2/AVX512/VNNI capable) and test assembly contains *pmadd* instructions + targets = [ + "llvm -mcpu=nehalem", + "llvm -mcpu=core-avx2", + "llvm -mcpu=skylake-avx512", + "llvm -mcpu=cascadelake", + ] llvm_version = tvm.target.codegen.llvm_version_major() for target in targets: if tvm.testing.device_enabled(target) and llvm_version >= 8: @@ -1675,7 +1724,7 @@ def _has_fast_int8_instructions(asm, target): # Check that a vectorized instruction is generated for older Intel # generations, because we default to NCHWc layout. 
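+    # (A hedged note on the target switch below: plain x86-64 only
+    # guarantees the SSE2 baseline, so LLVM falls back to the pre-AVX
+    # packed-integer forms, hence the pmulhw/paddd assertion replacing
+    # vpmulld/vpadd.  A quick local check, sketched with illustrative
+    # channel counts and the _compile helper defined above:
+    #
+    #     asm = _compile(ic=16, oc=32, target="llvm -mcpu=x86-64",
+    #                    data_layout="NCHW", kernel_layout="OIHW",
+    #                    dtypes=("uint8", "int8", "int32"))
+    #     assert "pmulhw" in asm and "paddd" in asm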
- target = "llvm -mcpu=core-avx2" + target = "llvm -mcpu=x86-64" if tvm.testing.device_enabled(target): fast_int8_dtypes = ("uint8", "int8", "int32") asm = _compile( @@ -1687,7 +1736,7 @@ def _has_fast_int8_instructions(asm, target): dtypes=fast_int8_dtypes, ) # Check that vector int mult and add instructions are generated. - assert "vpmulld" in asm and "vpadd" in asm + assert "pmulhw" in asm and "paddd" in asm @tvm.testing.uses_gpu @@ -1851,38 +1900,4 @@ def _test_correlation( if __name__ == "__main__": - test_pool1d() - test_pool2d() - test_pool3d() - test_avg_pool2d_no_count_pad() - test_lrn() - test_l2_normalize() - test_conv1d_infer_type() - test_conv2d_infer_type() - test_conv3d_infer_type() - test_bitpack_infer_type() - test_upsampling_infer_type() - test_upsampling3d_infer_type() - test_flatten_infer_type() - test_pad_infer_type() - test_pad_run() - test_pad_run_dynamic_pad_value() - test_conv3d_transpose_infer_type() - test_conv3d_transpose_ncdhw_run() - test_conv2d_transpose_infer_type() - test_conv2d_transpose_nchw_run() - test_conv2d_transpose_nhwc_run() - test_conv1d_transpose_ncw_run() - test_conv1d_run() - test_conv2d_run() - test_conv2d_winograd() - test_conv3d_run() - test_conv3d_ndhwc_run() - test_conv3d_winograd() - test_bitserial_conv2d_infer_type() - test_batch_flatten() - test_upsampling() - test_upsampling3d() - test_conv2d_int8_intrinsics() - test_depthwise_conv2d_int8() - test_correlation() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index e0b95fe7fbf7..eaddd33678df 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -16,21 +16,31 @@ # under the License. """ Support level3 operator test cases. """ +import sys from typing import Callable, Optional import numpy as np import pytest + import tvm import tvm.testing + from tvm import relay, te from tvm.error import TVMError from tvm.relay import create_executor, transform from tvm.relay.testing import check_grad, run_infer_type + from utils import ref_funcs -def test_zeros_ones(): - for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: +executor_kind = tvm.testing.parameter("graph", "debug") + + +class TestZerosOnes: + config = {"zeros": (relay.zeros, np.zeros), "ones": (relay.ones, np.ones)} + op, ref = tvm.testing.parameters(*config.values(), ids=config.keys()) + + def test_zeros_ones(self, op, ref): y = op(shape=(124, 50), dtype="float64") yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((124, 50), "float64") @@ -38,19 +48,22 @@ def test_zeros_ones(): np.testing.assert_allclose(intrp_res, ref((124, 50), "float64")) -def test_unary_identity(): - for op, ref in [ - (relay.zeros_like, np.zeros_like), - (relay.ones_like, np.ones_like), - (relay.ceil, np.ceil), - (relay.floor, np.floor), - (relay.trunc, np.trunc), - (relay.round, np.round), - (relay.abs, np.abs), - (relay.copy, None), # np.copy - (relay.negative, np.negative), - (relay.sign, np.sign), - ]: +class TestUnaryIdentity: + config = { + "zeros_like": (relay.zeros_like, np.zeros_like), + "ones_like": (relay.ones_like, np.ones_like), + "ceil": (relay.ceil, np.ceil), + "floor": (relay.floor, np.floor), + "trunc": (relay.trunc, np.trunc), + "round": (relay.round, np.round), + "abs": (relay.abs, np.abs), + "copy": (relay.copy, None), # np.copy + "negative": (relay.negative, np.negative), + "sign": (relay.sign, np.sign), + } + op, ref = tvm.testing.parameters(*config.values(), ids=config.keys()) + + def 
test_unary_identity(self, op, ref): shape = (8, 9, 4) x = relay.var("x", relay.TensorType(shape, "float32")) y = op(x) @@ -169,8 +182,14 @@ def reference_tanh(x): np.testing.assert_allclose(op_res.numpy(), reference_tanh(data), atol=4e-5, rtol=1e-9) -def test_squeeze(): - def verify_squeeze(shape, dtype, axis): +class TestSqueeze: + shape, dtype, axis = tvm.testing.parameters( + ((1, 3, 2, 5), "float32", None), + ((1, 3, 1), "float32", [0]), + ((1, 2, 1, 2, 1), "float32", [0, 2]), + ) + + def test_squeeze(self, shape, dtype, axis): x = relay.var("x", relay.TensorType(shape, dtype)) squeeze = relay.squeeze(x, axis=axis) @@ -181,10 +200,6 @@ def verify_squeeze(shape, dtype, axis): ref_res = np.squeeze(data, axis=np_axis) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01) - verify_squeeze((1, 3, 2, 5), "float32", None) - verify_squeeze((1, 3, 1), "float32", [0]) - verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2]) - def test_transpose_infer_type(): n, t, d = te.size_var("n"), te.size_var("t"), 100 @@ -200,24 +215,19 @@ def test_transpose_infer_type(): assert yy.checked_type == relay.TensorType((100, t, n), "float32") -@tvm.testing.uses_gpu -def test_transpose(): - def verify_transpose(dshape, axes): - x = relay.var("x", relay.TensorType(dshape, "float32")) - z = relay.transpose(x, axes=axes) +def test_transpose(target, dev, executor_kind): + dshape = (2, 3, 4) + axes = (0, 2, 1) - func = relay.Function([x], z) - x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") - ref_res = np.transpose(x_data, axes=axes) + x = relay.var("x", relay.TensorType(dshape, "float32")) + z = relay.transpose(x, axes=axes) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + func = relay.Function([x], z) + x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") + ref_res = np.transpose(x_data, axes=axes) - verify_transpose((2, 3, 4), (0, 2, 1)) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def test_squeeze_infer_type(): @@ -253,9 +263,26 @@ def test_reshape_infer_type(): assert yy.checked_type == relay.TensorType((n, t, 2000), "float32") -@tvm.testing.uses_gpu -def test_reshape(): - def verify_reshape(shape, newshape, oshape): +class TestReshape: + shape, newshape, oshape = tvm.testing.parameters( + ((2, 3, 4), (8, 3), (8, 3)), + ((4, 7), (2, 7, 2), (2, 7, 2)), + ((2, 3, 4), (4, 0, 2), (4, 3, 2)), + ((2, 3, 4), (2, 0, 0), (2, 3, 4)), + ((2, 3, 4), (0, -1), (2, 12)), + ((2, 3, 4), (-1, 0), (8, 3)), + ((2, 3, 4), (2, -2), (2, 3, 4)), + ((2, 3, 4), (-2, 1, 1), (2, 3, 4, 1, 1)), + ((2, 3, 4), (-3, 4), (6, 4)), + ((2, 3, 4, 5), (-3, -3), (6, 20)), + ((2, 3, 4), (0, -3), (2, 12)), + ((2, 3, 4), (-3, -2), (6, 4)), + ((2, 3, 4), (-4, 1, 2, -2), (1, 2, 3, 4)), + ((2, 3, 4), (2, -4, -1, 3, -2), (2, 1, 3, 4)), + ((1,), (), ()), + ) + + def test_reshape(self, target, dev, executor_kind, shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reshape(x, newshape=newshape) zz = run_infer_type(z) @@ -266,28 +293,10 @@ def verify_reshape(shape, newshape, oshape): check_grad(func) x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) - for target, dev in tvm.testing.enabled_targets(): 
- for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - verify_reshape((2, 3, 4), (8, 3), (8, 3)) - verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) - verify_reshape((2, 3, 4), (4, 0, 2), (4, 3, 2)) - verify_reshape((2, 3, 4), (2, 0, 0), (2, 3, 4)) - verify_reshape((2, 3, 4), (0, -1), (2, 12)) - verify_reshape((2, 3, 4), (-1, 0), (8, 3)) - verify_reshape((2, 3, 4), (2, -2), (2, 3, 4)) - verify_reshape((2, 3, 4), (-2, 1, 1), (2, 3, 4, 1, 1)) - verify_reshape((2, 3, 4), (-3, 4), (6, 4)) - verify_reshape((2, 3, 4, 5), (-3, -3), (6, 20)) - verify_reshape((2, 3, 4), (0, -3), (2, 12)) - verify_reshape((2, 3, 4), (-3, -2), (6, 4)) - verify_reshape((2, 3, 4), (-4, 1, 2, -2), (1, 2, 3, 4)) - verify_reshape((2, 3, 4), (2, -4, -1, 3, -2), (2, 1, 3, 4)) - verify_reshape((1,), (), ()) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def test_reshape_fail(): @@ -340,9 +349,16 @@ def test_reshape_like_infer_type(): assert w.checked_type == relay.TensorType((5, 6, 4), "float32") -@tvm.testing.uses_gpu -def test_reshape_like(): - def verify_reshape_like(shape, oshape, shape_like=None, reshape_like_kwargs={}): +class TestReshapeLike: + shape, oshape, shape_like, reshape_like_kwargs = tvm.testing.parameters( + ((2, 3, 4), (1, 8, 3), None, {}), + ((4, 7), (2, 7, 2), None, {}), + ((1, 2, 3, 4), (1, 6, 4), (1, 6, 5), dict(lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2)), + ) + + def test_reshape_like( + self, target, dev, executor_kind, shape, oshape, shape_like=None, reshape_like_kwargs={} + ): if shape_like is None: shape_like = oshape x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") @@ -357,41 +373,56 @@ def verify_reshape_like(shape, oshape, shape_like=None, reshape_like_kwargs={}): func = relay.Function([x, y], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, y_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, y_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - verify_reshape_like((2, 3, 4), (1, 8, 3)) - verify_reshape_like((4, 7), (2, 7, 2)) - verify_reshape_like( - (1, 2, 3, 4), (1, 6, 4), (1, 6, 5), dict(lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2) - ) +class TestTakeInferType: + d1, d2, d3 = te.var("d1"), te.var("d2"), te.var("d3") + d4, d5, d6 = te.var("d4"), te.var("d5"), te.var("d6") + dshape, indices_shape, oshape, axis = tvm.testing.parameters( + ((d1,), (1,), (1,), 0), + ((4,), (d1, d2), (d1, d2), None), + ((3, 3, 3), (1, d2), (1, d2), None), + ((d1, d2), (d3, d4, d5), (d3, d4, d5, d2), 0), + ((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1), + ((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2), + ) -def test_take_infer_type(): - def verify_take(dshape, indices_shape, oshape, axis=None): + def test_take(self, dshape, indices_shape, oshape, axis): x = relay.var("x", relay.TensorType(dshape, "float32")) indices = relay.var("indices", relay.TensorType(indices_shape, "int32")) y = relay.take(x, indices, axis=axis) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType(oshape, "float32") - d1, d2, d3 = te.var("d1"), te.var("d2"), 
te.var("d3") - d4, d5, d6 = te.var("d4"), te.var("d5"), te.var("d6") - verify_take((d1,), (1,), (1,), 0) - verify_take((4,), (d1, d2), (d1, d2)) - verify_take((3, 3, 3), (1, d2), (1, d2)) - verify_take((d1, d2), (d3, d4, d5), (d3, d4, d5, d2), 0) - verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1) - verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2) +class TestTake: + src_shape, indices_src, axis, mode = tvm.testing.parameters( + ((4,), [1], None, "clip"), + ((4,), [[0, 1, 2, 3]], None, "clip"), + ((3, 3, 3), [[11, 25]], None, "clip"), + ((4,), [[0, 1], [2, 3]], None, "clip"), + ((4,), [1], 0, "clip"), + ((2, 2), [[[1, 0], [0, 1]]], 0, "clip"), + ((2, 2), [[[1, 0], [0, 1]]], 1, "clip"), + ((4, 3, 5, 6), [[2, 1, 0, 0]], -2, "clip"), + ((3, 4), [-5, 20], None, "clip"), + ((3, 4), [-5, 20], None, "wrap"), + ((3, 4), [-1, 2], 0, "clip"), + ((3, 4), [-1, 2], 0, "wrap"), + ((3, 4), [-1, 2], 1, "clip"), + ((3, 4), [-1, 2], 1, "wrap"), + ((3, 3, 3), [[11, 25]], None, "fast"), + ((3, 4), [0, 2], 0, "fast"), + ((3, 4), [0, 2], 1, "fast"), + ) -@tvm.testing.uses_gpu -def test_take(): - def verify_take(src_shape, indices_src, axis=None, mode="clip"): + # Incorrect numeric output in some cases on vulkan + @tvm.testing.known_failing_targets("vulkan") + def test_take(self, target, dev, executor_kind, src_shape, indices_src, axis, mode): src_dtype = "float32" indices_dtype = "int32" indices_src = np.array(indices_src, dtype=indices_dtype) @@ -404,134 +435,117 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"): np_mode = "raise" if mode == "fast" else mode ref_res = np.take(x_data, indices=indices_src, axis=axis, mode=np_mode) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, indices_src - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - verify_take((4,), [1]) - verify_take((4,), [[0, 1, 2, 3]]) - verify_take((3, 3, 3), [[11, 25]]) - verify_take((4,), [[0, 1], [2, 3]]) - verify_take((4,), [1], 0) - verify_take((2, 2), [[[1, 0], [0, 1]]], 0) - verify_take((2, 2), [[[1, 0], [0, 1]]], 1) - verify_take((4, 3, 5, 6), [[2, 1, 0, 0]], -2) - verify_take((3, 4), [-5, 20]) - verify_take((3, 4), [-5, 20], mode="wrap") - verify_take((3, 4), [-1, 2], axis=0) - verify_take((3, 4), [-1, 2], axis=0, mode="wrap") - verify_take((3, 4), [-1, 2], axis=1) - verify_take((3, 4), [-1, 2], axis=1, mode="wrap") - verify_take((3, 3, 3), [[11, 25]], mode="fast") - verify_take((3, 4), [0, 2], axis=0, mode="fast") - verify_take((3, 4), [0, 2], axis=1, mode="fast") - - -def test_split_infer_type(): - def verify_split(dshape, indices_or_sections, ret_type, axis=None): - x = relay.var("x", relay.ty.TensorType(dshape, "float32")) - y = relay.split(x, indices_or_sections, axis=axis) - yy = run_infer_type(y.astuple()) - assert yy.checked_type == ret_type + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, indices_src + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + +class TestSplitInferType: idxd = tvm.tir.indexdiv d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") axis = te.var("axis") - verify_split( - (5, 5, 2, 2), - 5, - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((5, 1, 2, 2), "float32"), - relay.ty.TensorType((5, 1, 2, 2), "float32"), - relay.ty.TensorType((5, 1, 2, 2), "float32"), - relay.ty.TensorType((5, 1, 2, 2), "float32"), - 
relay.ty.TensorType((5, 1, 2, 2), "float32"), - ] - ) + + dshape, indices_or_sections, ret_type, axis = tvm.testing.parameters( + ( + (5, 5, 2, 2), + 5, + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((5, 1, 2, 2), "float32"), + relay.ty.TensorType((5, 1, 2, 2), "float32"), + relay.ty.TensorType((5, 1, 2, 2), "float32"), + relay.ty.TensorType((5, 1, 2, 2), "float32"), + relay.ty.TensorType((5, 1, 2, 2), "float32"), + ] + ) + ), + 1, ), - axis=1, - ) - verify_split( - (5, 5, 2, 2), - 5, - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((1, 5, 2, 2), "float32"), - relay.ty.TensorType((1, 5, 2, 2), "float32"), - relay.ty.TensorType((1, 5, 2, 2), "float32"), - relay.ty.TensorType((1, 5, 2, 2), "float32"), - relay.ty.TensorType((1, 5, 2, 2), "float32"), - ] - ) + ( + (5, 5, 2, 2), + 5, + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((1, 5, 2, 2), "float32"), + relay.ty.TensorType((1, 5, 2, 2), "float32"), + relay.ty.TensorType((1, 5, 2, 2), "float32"), + relay.ty.TensorType((1, 5, 2, 2), "float32"), + relay.ty.TensorType((1, 5, 2, 2), "float32"), + ] + ) + ), + 0, ), - axis=0, - ) - verify_split( - (d1, d2, d3, d4), - 4, - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), - relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), - relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), - relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), - ] - ) + ( + (d1, d2, d3, d4), + 4, + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + ] + ) + ), + 2, ), - axis=2, - ) - verify_split( - (d1, d2, d3, d4), - 2, - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), - relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), - ] - ) + ( + (d1, d2, d3, d4), + 2, + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), + relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), + ] + ) + ), + 0, ), - axis=0, - ) - verify_split( - (d1, d2, d3, d4), - (2, 4, 7), - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((d1, 2, d3, d4), "float32"), - relay.ty.TensorType((d1, 2, d3, d4), "float32"), - relay.ty.TensorType((d1, 3, d3, d4), "float32"), - relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"), - ] - ) + ( + (d1, d2, d3, d4), + (2, 4, 7), + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((d1, 2, d3, d4), "float32"), + relay.ty.TensorType((d1, 2, d3, d4), "float32"), + relay.ty.TensorType((d1, 3, d3, d4), "float32"), + relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"), + ] + ) + ), + 1, ), - axis=1, - ) - verify_split( - (d1, d2, d3, d4), - tuple(np.array([2, 4, 7]).astype(np.int64)), - relay.ty.TupleType( - tvm.runtime.convert( - [ - relay.ty.TensorType((d1, 2, d3, d4), "float32"), - relay.ty.TensorType((d1, 2, d3, d4), "float32"), - relay.ty.TensorType((d1, 3, d3, d4), "float32"), - relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"), - ] - ) + ( + (d1, d2, d3, d4), + tuple(np.array([2, 4, 7]).astype(np.int64)), + relay.ty.TupleType( + tvm.runtime.convert( + [ + relay.ty.TensorType((d1, 2, d3, d4), "float32"), + relay.ty.TensorType((d1, 2, d3, d4), "float32"), + 
relay.ty.TensorType((d1, 3, d3, d4), "float32"), + relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32"), + ] + ) + ), + 1, ), - axis=1, ) + def test_split(self, dshape, indices_or_sections, ret_type, axis): + x = relay.var("x", relay.ty.TensorType(dshape, "float32")) + y = relay.split(x, indices_or_sections, axis=axis) + yy = run_infer_type(y.astuple()) + assert yy.checked_type == ret_type + def test_full_infer_type(): # default settings: match input dtype @@ -548,23 +562,36 @@ def test_full_infer_type(): assert yy.checked_type == relay.TensorType((1, 2), "int8") -@tvm.testing.uses_gpu -def test_full(): - def verify_full(fill_value, src_shape, dtype): +class TestFull: + fill_value, arr_shape, dtype = tvm.testing.parameters( + (4, (1, 3, 4, 4), "int32"), + (4, (1, 3, 4, 4), "int64"), + (4.0, (1, 4), "float32"), + ) + + def test_full(self, target, dev, executor_kind, fill_value, arr_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) - z = relay.full(x, src_shape, dtype) + z = relay.full(x, arr_shape, dtype) func = relay.Function([x], z) - ref_res = np.full(src_shape, fill_value) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - np.array(fill_value, dtype) - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + ref_res = np.full(arr_shape, fill_value, dtype=dtype) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + np.array(fill_value, dtype) + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - verify_full(4, (1, 3, 4, 4), "int32") - # verify_full(4, (1, 3, 4, 4), "int64") # This does not pass, python int32 is not upcast to int64, not sure how to fix it. 
- verify_full(4.0, (1, 4), "float32") + def test_full_like(self, target, dev, executor_kind, arr_shape, fill_value, dtype): + x_data = np.random.uniform(low=-1, high=1, size=arr_shape).astype(dtype) + x = relay.var("x", relay.TensorType(arr_shape, dtype)) + y = relay.var("y", relay.scalar_type(dtype)) + z = relay.full_like(x, y) + + func = relay.Function([x, y], z) + ref_res = np.full_like(x_data, fill_value) + + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, np.array(fill_value, dtype) + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def test_full_like_infer_type(): @@ -584,30 +611,7 @@ def test_full_like_infer_type(): assert yy.checked_type == relay.TensorType((n, c, h, w), "float32") -@tvm.testing.uses_gpu -def test_full_like(): - def verify_full_like(base, fill_value, dtype): - x_data = np.random.uniform(low=-1, high=1, size=base).astype(dtype) - x = relay.var("x", relay.TensorType(base, dtype)) - y = relay.var("y", relay.scalar_type(dtype)) - z = relay.full_like(x, y) - - func = relay.Function([x, y], z) - ref_res = np.full_like(x_data, fill_value) - - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, np.array(fill_value, dtype) - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - verify_full_like((1, 3, 4, 4), 4, "int32") - verify_full_like((1, 1), 44.0, "float32") - - -@tvm.testing.uses_gpu -def test_infer_type_leaky_relu(): +def test_infer_type_leaky_relu(target, dev): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.leaky_relu(x, alpha=0.1) @@ -626,42 +630,55 @@ def test_infer_type_leaky_relu(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = np.where(x_data > 0, x_data, x_data * 0.1) - for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) + op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) -def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): - x = relay.var("data", relay.TensorType(data, dtype)) - if alpha: - y = relay.var("alpha", relay.TensorType(alpha, dtype)) - else: - y = relay.var("alpha", relay.IncompleteType()) - z = relay.nn.prelu(x, y, axis=axis) - zz = run_infer_type(z) - if axis != 1: - assert "axis" in z.astext() - assert zz.checked_type == relay.ty.TensorType(output, dtype) - if not alpha: - axis = axis if axis else 1 - alpha_shape = (data[axis],) - assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32") - - if all(isinstance(v, tvm.tir.Var) == 1 for v in data) or not alpha: - return - - func = relay.Function([x, y], z) - x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype) - a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype) - - if axis == 1: - ref_res = (x_data < 0) 
* (x_data * a_data.reshape(3, 1, 1)) + (x_data >= 0) * x_data
-    else:
-        ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data
-
-    for target, dev in tvm.testing.enabled_targets():
+class TestInferTypePrelu:
+    dtype = tvm.testing.parameter("float32")
+
+    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
+    data, alpha, axis, output = tvm.testing.parameters(
+        ((n, c, h, w), (c,), 1, (n, c, h, w)),
+        ((n, h, w, c), (c,), 3, (n, h, w, c)),
+        ((n, c, h, w), None, 1, (n, c, h, w)),
+        ((n, h, w, c), None, 3, (n, h, w, c)),
+        ((1, 3, 2, 2), (3,), 1, (1, 3, 2, 2)),
+        ((1, 2, 2, 3), (3,), 3, (1, 2, 2, 3)),
+        ((1, 3, 2, 2), None, 1, (1, 3, 2, 2)),
+        ((1, 2, 2, 3), None, 3, (1, 2, 2, 3)),
+    )
+
+    def test_infer_type_prelu(self, target, dev, data, alpha, axis, output, dtype):
+        x = relay.var("data", relay.TensorType(data, dtype))
+        if alpha:
+            y = relay.var("alpha", relay.TensorType(alpha, dtype))
+        else:
+            y = relay.var("alpha", relay.IncompleteType())
+        z = relay.nn.prelu(x, y, axis=axis)
+        zz = run_infer_type(z)
+        if axis != 1:
+            assert "axis" in z.astext()
+        assert zz.checked_type == relay.ty.TensorType(output, dtype)
+        if not alpha:
+            axis = axis if axis else 1
+            alpha_shape = (data[axis],)
+            assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32")
+
+        if all(isinstance(v, tvm.tir.Var) == 1 for v in data) or not alpha:
+            return
+
+        func = relay.Function([x, y], z)
+        x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype)
+        a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype)
+
+        if axis == 1:
+            ref_res = (x_data < 0) * (x_data * a_data.reshape(3, 1, 1)) + (x_data >= 0) * x_data
+        else:
+            ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data
+
         op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
             x_data, a_data
         )
@@ -672,23 +689,24 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
         tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
 
 
-@tvm.testing.uses_gpu
-def test_infer_type_prelu():
-    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
-    verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w))
-    verify_infer_type_prelu((n, h, w, c), (c,), 3, (n, h, w, c))
-    verify_infer_type_prelu((n, c, h, w), None, 1, (n, c, h, w))
-    verify_infer_type_prelu((n, h, w, c), None, 3, (n, h, w, c))
-    verify_infer_type_prelu((1, 3, 2, 2), (3,), 1, (1, 3, 2, 2))
-    verify_infer_type_prelu((1, 2, 2, 3), (3,), 3, (1, 2, 2, 3))
-    verify_infer_type_prelu((1, 3, 2, 2), None, 1, (1, 3, 2, 2))
-    verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3))
-
-
-@tvm.testing.uses_gpu
-def test_arange():
-    def verify_arange(start, stop, step):
-        dtype = "float32"
+class TestArange:
+    dtype = tvm.testing.parameter("float32")
+
+    start, stop, step = tvm.testing.parameters(
+        (None, 20, None),
+        (None, 20, 2),
+        (1, 20, None),
+        (1, 20, 2),
+        # arange doesn't support floating point right now, see type relation
+        # (1, 20, 1.5),
+        (1, 20.5, None),
+        (1, 20, 3),
+        (20, 1, -1),
+        # arange doesn't support floating point right now, see type relation
+        # (20, 1, -1.5),
+    )
+
+    def test_arange(self, target, dev, executor_kind, start, stop, step, dtype):
         if start is None and step is None:
             x = relay.arange(relay.const(stop, dtype=dtype))
             ref_res = np.arange(stop).astype(dtype)
@@ -707,27 +725,21 @@ def verify_arange(start, stop, step):
             ref_res = np.arange(start, stop, step).astype(dtype)
 
         func =
relay.Function([], x) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)() - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - verify_arange(None, 20, None) - verify_arange(None, 20, 2) - verify_arange(1, 20, None) - verify_arange(1, 20, 2) - # arange doesnt' support floating point right now, see type relation - # verify_arange(1, 20, 1.5) - verify_arange(1, 20.5, None) - verify_arange(1, 20, 3) - verify_arange(20, 1, -1) - # arange doesnt' support floating point right now, see type relation - # verify_arange(20, 1, -1.5) - - -@tvm.testing.uses_gpu -def test_meshgrid(): - def verify_meshgrid(lengths, indexing="ij"): + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)() + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + + +class TestMeshgrid: + lengths, indexing = tvm.testing.parameters( + ([3, 5], "ij"), + ([4, 2], "xy"), + ([3, 5, 2], "ij"), + ([3, 1, 5], "xy"), + # Length 0 signifies scalar. + ([3, 5, 0], "ij"), + ) + + def test_meshgrid(self, target, dev, executor_kind, lengths, indexing="ij"): input_vars = [] input_data = [] for i, length in enumerate(lengths): @@ -745,26 +757,22 @@ def verify_meshgrid(lengths, indexing="ij"): # Get ref ref_res = np.meshgrid(*input_data, indexing=indexing) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - *input_data - ) - assert len(op_res) == len(ref_res) - for i in range(len(op_res)): - tvm.testing.assert_allclose(op_res[i].numpy(), ref_res[i], rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + *input_data + ) + assert len(op_res) == len(ref_res) + for i in range(len(op_res)): + tvm.testing.assert_allclose(op_res[i].numpy(), ref_res[i], rtol=1e-5) - verify_meshgrid([3, 5]) - verify_meshgrid([4, 2], indexing="xy") - verify_meshgrid([3, 5, 2]) - verify_meshgrid([3, 1, 5], indexing="xy") - # Length 0 signifies scalar. 
- verify_meshgrid([3, 5, 0]) +class TestTile: + dshape, reps = tvm.testing.parameters( + ((2, 3, 4), (3, 2, 1)), + ((2, 3, 4), (1, 2)), + ((2, 3), (3, 2, 1)), + ) -@tvm.testing.uses_gpu -def test_tile(): - def verify_tile(dshape, reps): + def test_tile(self, target, dev, executor_kind, dshape, reps): x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.tile(x, reps=reps) @@ -772,95 +780,91 @@ def verify_tile(dshape, reps): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.tile(x_data, reps=reps) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - verify_tile((2, 3, 4), (3, 2, 1)) - verify_tile((2, 3, 4), (1, 2)) - verify_tile((2, 3), (3, 2, 1)) +class TestRepeat: + dshape, repeats, axis = tvm.testing.parameters( + ((3,), 2, 0), + ((3, 10), 2, -1), + ((3, 2, 4), 3, 1), + ) -@tvm.testing.uses_gpu -def test_repeat(): - def verify_repeat(dshape, repeats, axis): + def test_repeat(self, target, dev, executor_kind, dshape, repeats, axis): x = relay.Var("x", relay.TensorType(dshape, "float32")) func = relay.Function([x], relay.repeat(x, repeats, axis)) data = np.random.uniform(size=dshape).astype("float32") ref_res = np.repeat(data, repeats, axis) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(data) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + - verify_repeat((3,), 2, 0) - verify_repeat((3, 10), 2, -1) - verify_repeat((3, 2, 4), 3, 1) +class TestStack: + dshapes, axis = tvm.testing.parameters( + ([(2,), (2,), (2,)], -1), + ([(2,), (2,), (2,)], 0), + ([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1), + ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1), + ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4), + ) + + expr_type = tvm.testing.parameter("tuple", "list", "tuple_expr") + + @tvm.testing.fixture + def ref_data(self, dshapes, axis): + np_in = [np.random.normal(size=shape).astype("float32") for shape in dshapes] + np_out = np.stack(np_in, axis=axis) + return np_in, np_out + + @tvm.testing.fixture + def input_expr(self, dshapes, axis, expr_type, ref_data): + input_vars = [relay.var("input", relay.TensorType(shape, "float32")) for shape in dshapes] + if expr_type == "tuple": + input_expr = relay.Tuple(input_vars) -@tvm.testing.uses_gpu -def test_stack(): - def produce_input_tuple(dshapes): - y = [relay.var("input", relay.TensorType(shape, "float32")) for shape in dshapes] - return relay.Tuple(y) + elif expr_type == "list": + input_expr = input_vars - def ref_stack(inputs, axis): - return np.stack(inputs, axis=axis) + elif expr_type == "tuple_expr": + # expression that evaluates to a tuple + # but is not a tuple literal + np_in, np_out = ref_data + x = relay.Var("x") + input_expr = relay.Let(x, relay.Tuple([relay.const(inp) for inp in np_in]), x) - def verify_stack(input_expr, relay_args, ref_res, axis): + else: + raise ValueError(f"Unknown expr_type '{expr_type}'") + + 
return input_expr + + def test_stack(self, target, dev, executor_kind, input_expr, ref_data, axis): z = relay.stack(input_expr, axis=axis) inp_vars = relay.analysis.free_vars(z) func = relay.Function(inp_vars, z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - *relay_args - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - def verify_tup_lit_stack(dshapes, axis): - input_tuple = produce_input_tuple(dshapes) - input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] - ref_res = ref_stack(input_data, axis) - verify_stack(input_tuple, input_data, ref_res, axis) - - def verify_list_lit_stack(dshapes, axis): - input_list = produce_input_tuple(dshapes).fields - input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] - ref_res = ref_stack(input_data, axis) - verify_stack(input_list, input_data, ref_res, axis) - - def verify_tup_expr_stack(dshapes, axis): - input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] - ref_res = ref_stack(input_data, axis) - - # expression that evaluates to a tuple - # but is not a tuple literal - x = relay.Var("x") - input_expr = relay.Let(x, relay.Tuple([relay.const(inp) for inp in input_data]), x) - verify_stack(input_expr, [], ref_res, axis) - - dshape_axis_combos = [ - ([(2,), (2,), (2,)], -1), - ([(2,), (2,), (2,)], 0), - ([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1), - ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1), - ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4), - ] + np_in, np_out = ref_data + relay_args = np_in if inp_vars else [] - for dshapes, axis in dshape_axis_combos: - verify_tup_lit_stack(dshapes, axis) - verify_list_lit_stack(dshapes, axis) - verify_tup_expr_stack(dshapes, axis) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + *relay_args + ) + tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-5) -@tvm.testing.uses_gpu -def test_reverse(): - def verify_reverse(dshape, axis): +class TestReverse: + dshape, axis = tvm.testing.parameters( + ((2, 3, 4), 1), + ((4, 7), 0), + ((2, 3, 4), -1), + ) + + def test_reverse(self, target, dev, executor_kind, dshape, axis): x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.reverse(x, axis=axis) zz = run_infer_type(z) @@ -868,20 +872,13 @@ def verify_reverse(dshape, axis): func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.flip(x_data, axis) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - - verify_reverse((2, 3, 4), 1) - verify_reverse((4, 7), 0) - verify_reverse((2, 3, 4), -1) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) -@tvm.testing.uses_gpu -def test_reverse_sequence(): +def test_reverse_sequence(target, dev, executor_kind): def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): seq_lengths_data = np.array(seq_lengths).astype("int32") x = relay.var("x", relay.TensorType(x_data.shape, str(x_data.dtype))) @@ -890,12 +887,10 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, 
seq_axis, ref_res): assert zz.checked_type == x.type_annotation func = relay.Function([x], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32") result = [[0, 5, 10, 15], [4, 1, 6, 11], [8, 9, 2, 7], [12, 13, 14, 3]] @@ -958,19 +953,19 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): ) -@tvm.testing.uses_gpu -def test_scatter(): - def ref_scatter(data, indices, updates, axis=0): - idx = np.indices(indices.shape).reshape(indices.ndim, -1) +def ref_scatter(data, indices, updates, axis=0): + idx = np.indices(indices.shape).reshape(indices.ndim, -1) - updated_idx = np.copy(idx) - indices = indices.reshape(-1) - for i in range(len(indices)): - updated_idx[axis, i] = indices[i] - scattered = np.copy(data) - scattered[tuple(updated_idx)] = updates[tuple(idx)] - return scattered + updated_idx = np.copy(idx) + indices = indices.reshape(-1) + for i in range(len(indices)): + updated_idx[axis, i] = indices[i] + scattered = np.copy(data) + scattered[tuple(updated_idx)] = updates[tuple(idx)] + return scattered + +def test_scatter(target, dev, executor_kind): def verify_scatter(dshape, ishape, axis=0): d = relay.var("d", relay.TensorType(dshape, "float32")) i = relay.var("i", relay.TensorType(ishape, "int64")) @@ -985,14 +980,45 @@ def verify_scatter(dshape, ishape, axis=0): ref_res = ref_scatter(data_np, indices_np, updates_np, axis) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data_np, indices_np, updates_np - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data_np, indices_np, updates_np + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - def verify_dynamic_scatter(dshape, ishape, axis=0): + verify_scatter((10,), (10,), 0) + verify_scatter((10, 5), (10, 5), -2) + verify_scatter((10, 5), (10, 5), -1) + verify_scatter((10, 5), (3, 5), 0) + verify_scatter((12, 4), (7, 2), 1) + verify_scatter((2, 3, 4), (1, 3, 4), 0) + verify_scatter((2, 3, 4), (2, 1, 4), 1) + verify_scatter((2, 3, 4), (2, 3, 1), 2) + verify_scatter((4, 2, 1), (1, 1, 1), 0) + verify_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0) + verify_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1) + verify_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2) + verify_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) + + +class TestDynamicScatter: + dshape, ishape, axis = tvm.testing.parameters( + ((10,), (10,), 0), + ((10, 5), (10, 5), -2), + ((10, 5), (10, 5), -1), + ((10, 5), (3, 5), 0), + ((12, 4), (7, 2), 1), + ((2, 3, 4), (1, 3, 4), 0), + ((2, 3, 4), (2, 1, 4), 1), + ((2, 3, 4), (2, 3, 1), 2), + ((4, 2, 1), (1, 1, 1), 0), + ((2, 3, 4, 5), (1, 3, 4, 5), 0), + ((6, 3, 4, 5), (2, 3, 4, 5), 1), + ((2, 3, 8, 5), (2, 3, 1, 1), 2), + ((16, 16, 4, 5), (16, 16, 4, 5), 3), + ) + + @pytest.mark.parametrize("executor_kind", ["vm", "debug"]) + def test_dynamic_scatter(self, target, dev, executor_kind, dshape, ishape, axis): d = relay.var("d", relay.TensorType([relay.Any() for i in 
range(len(dshape))], "float32")) i = relay.var("i", relay.TensorType([relay.Any() for i in range(len(ishape))], "int64")) u = relay.var("u", relay.TensorType([relay.Any() for i in range(len(ishape))], "float32")) @@ -1006,47 +1032,15 @@ def verify_dynamic_scatter(dshape, ishape, axis=0): ref_res = ref_scatter(data_np, indices_np, updates_np, axis) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - data_np, indices_np, updates_np - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()(data_np, indices_np, updates_np) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - verify_scatter((10,), (10,), 0) - verify_scatter((10, 5), (10, 5), -2) - verify_scatter((10, 5), (10, 5), -1) - verify_scatter((10, 5), (3, 5), 0) - verify_scatter((12, 4), (7, 2), 1) - verify_scatter((2, 3, 4), (1, 3, 4), 0) - verify_scatter((2, 3, 4), (2, 1, 4), 1) - verify_scatter((2, 3, 4), (2, 3, 1), 2) - verify_scatter((4, 2, 1), (1, 1, 1), 0) - verify_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0) - verify_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1) - verify_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2) - verify_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) - verify_dynamic_scatter((10,), (10,), 0) - verify_dynamic_scatter((10, 5), (10, 5), -2) - verify_dynamic_scatter((10, 5), (10, 5), -1) - verify_dynamic_scatter((10, 5), (3, 5), 0) - verify_dynamic_scatter((12, 4), (7, 2), 1) - verify_dynamic_scatter((2, 3, 4), (1, 3, 4), 0) - verify_dynamic_scatter((2, 3, 4), (2, 1, 4), 1) - verify_dynamic_scatter((2, 3, 4), (2, 3, 1), 2) - verify_dynamic_scatter((4, 2, 1), (1, 1, 1), 0) - verify_dynamic_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0) - verify_dynamic_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1) - verify_dynamic_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2) - verify_dynamic_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) - - -@tvm.testing.uses_gpu -@pytest.mark.parametrize( - "dshape, ishape, axis, dtype", - [ +class TestScatterAdd: + dshape, ishape, axis, dtype = tvm.testing.parameters( ((10,), (10,), 0, "int32"), ((1000,), (1000,), 0, "int32"), ((10, 5), (10, 5), -2, "float32"), @@ -1060,18 +1054,25 @@ def verify_dynamic_scatter(dshape, ishape, axis=0): ((6, 3, 4, 5), (2, 3, 4, 5), 1, "float32"), ((2, 3, 8, 5), (2, 3, 1, 1), 2, "float32"), ((16, 16, 4, 5), (16, 16, 4, 5), 3, "float32"), - ], -) -def test_scatter_add(dshape, ishape, axis, dtype): - def ref_scatter_add(data, indices, updates, axis=0): - output = np.copy(data) - for index in np.ndindex(*indices.shape): - new_index = list(index) - new_index[axis] = indices[index] - output[tuple(new_index)] += updates[index] - return output + ) - def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): + @tvm.testing.fixture(cache_return_value=True) + def ref_data(self, dshape, ishape, axis, dtype): + data_np = np.random.uniform(size=dshape).astype(dtype) + updates_np = np.random.uniform(size=ishape).astype(dtype) + indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") + + out_np = np.copy(data_np) + for index in np.ndindex(*indices_np.shape): + new_index = list(index) + new_index[axis] = indices_np[index] + out_np[tuple(new_index)] += updates_np[index] + return data_np, updates_np, indices_np, out_np + + # Optimization can produce tir.atomic_add, not 
currently supported + # on vulkan runtime. + @tvm.testing.known_failing_targets("vulkan") + def test_scatter_add(self, target, dev, ref_data, dshape, ishape, axis, dtype): d = relay.var("d", relay.TensorType(shape=[relay.Any() for _ in dshape], dtype=dtype)) i = relay.var("i", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype="int64")) u = relay.var("u", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype=dtype)) @@ -1079,22 +1080,11 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): func = relay.Function([d, i, u], z) - data_np = np.random.uniform(size=dshape).astype(dtype) - updates_np = np.random.uniform(size=ishape).astype(dtype) - indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") + data_np, updates_np, indices_np, out_np = ref_data - ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis) + verify_func(target, dev, func, [data_np, indices_np, updates_np], out_np) - verify_func( - func, - [data_np, indices_np, updates_np], - ref_res, - ) - - verify_scatter_add(dshape, ishape, axis, dtype) - -@tvm.testing.uses_gpu @pytest.mark.parametrize( "data, axis, indices, ref_res", [ @@ -1250,7 +1240,7 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): ), ], ) -def test_gather(data, axis, indices, ref_res): +def test_gather(target, dev, executor_kind, data, axis, indices, ref_res): def verify_gather(data, axis, indices, ref_res): data = np.asarray(data, dtype="float32") indices = np.asarray(indices, dtype="int32") @@ -1261,18 +1251,15 @@ def verify_gather(data, axis, indices, ref_res): func = relay.Function([d, i], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data, indices - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data, indices + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) verify_gather(data, axis, indices, ref_res) -@tvm.testing.uses_gpu -def test_gather_nd(): +def test_gather_nd(target, dev, executor_kind): def verify_gather_nd(xshape, yshape, y_data, batch_dims=0): x = relay.var("x", relay.TensorType(xshape, "float32")) y = relay.var("y", relay.TensorType(yshape, "int32")) @@ -1289,12 +1276,10 @@ def verify_gather_nd(xshape, yshape, y_data, batch_dims=0): ref_res = ref_funcs.gather_nd(x_data, y_data, batch_dims) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, y_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, y_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) verify_gather_nd((2, 2), (2, 3), [[1, 1, 0], [0, 1, 0]]) verify_gather_nd((2, 2, 2), (2, 2), [[0, 1], [1, 0]]) @@ -1353,8 +1338,7 @@ def test_isinf(): _verify_infiniteness_ops(relay.isinf, np.isinf) -@tvm.testing.uses_gpu -def test_unravel_index(): +def test_unravel_index(target, dev, executor_kind): def verify_unravel_index(indices, shape, dtype): x_data = np.array(indices).astype(dtype) y_data = np.array(shape).astype(dtype) @@ -1372,12 +1356,10 @@ def verify_unravel_index(indices, shape, dtype): func = relay.Function([x, y], z) ref_res = np.unravel_index(x_data, y_data) - for target, dev in 
tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, y_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, y_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) for dtype in ["int64", "int32"]: verify_unravel_index([0, 1, 2, 3], [2, 2], dtype) @@ -1392,8 +1374,7 @@ def verify_unravel_index(indices, shape, dtype): # verify_unravel_index([0, 1, 2, 5], [2, 2], dtype) -@tvm.testing.uses_gpu -def test_sparse_to_dense(): +def test_sparse_to_dense(target, dev, executor_kind): def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected): sparse_indices_data = np.array(sparse_indices) sparse_values_data = np.array(sparse_values) @@ -1419,14 +1400,12 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ assert zz.checked_type == relay.ty.TensorType(output_shape, str(sparse_values_data.dtype)) func = relay.Function(args, d) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - f = relay.create_executor(kind, device=dev, target=target).evaluate(func) - if default_value is None: - op_res = f(sparse_indices_data, sparse_values_data) - else: - op_res = f(sparse_indices_data, sparse_values_data, default_value_data) - tvm.testing.assert_allclose(op_res.numpy(), xpected, rtol=1e-5) + f = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func) + if default_value is None: + op_res = f(sparse_indices_data, sparse_values_data) + else: + op_res = f(sparse_indices_data, sparse_values_data, default_value_data) + tvm.testing.assert_allclose(op_res.numpy(), xpected, rtol=1e-5) verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0]) # scalar verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3]) # vector @@ -1454,10 +1433,9 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]) -@tvm.testing.uses_gpu -@pytest.mark.parametrize( - "sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np", - [ +class TestSparseReshape: + + sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np = tvm.testing.parameters( ( np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int32), np.array([7, 5, 6, 3, 9], dtype=np.int32), @@ -1542,46 +1520,48 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ np.array([3, 6], dtype=np.int32), np.array([-1, 2], dtype=np.int32), ), - ], -) -@pytest.mark.parametrize("use_dyn", [True, False]) -def test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn): - def ref_sparse_reshape( - sparse_indices: np.ndarray, - prev_shape: np.ndarray, - new_shape: np.ndarray, + ) + + use_dyn = tvm.testing.parameter(True, False, ids=["dyn", "static"]) + + @tvm.testing.fixture(cache_return_value=True) + def ref_res( + self, + sparse_indices_np: np.ndarray, + prev_shape_np: np.ndarray, + new_shape_np: np.ndarray, ): """ This function calculates the expected output of sparseshape operator given the inputs. 
""" new_sparse_indices = np.ones( - (sparse_indices.shape[0], new_shape.shape[0]), dtype=sparse_indices.dtype + (sparse_indices_np.shape[0], new_shape_np.shape[0]), dtype=sparse_indices_np.dtype ) - multipliers = np.ones(prev_shape.shape[0]) - dividers = np.ones(new_shape.shape[0]) - total_ele = np.prod(prev_shape) + multipliers = np.ones(prev_shape_np.shape[0]) + dividers = np.ones(new_shape_np.shape[0]) + total_ele = np.prod(prev_shape_np) division_total_ele = 1 - for i in range(new_shape.shape[0]): - if new_shape[i] == -1: + for i in range(new_shape_np.shape[0]): + if new_shape_np[i] == -1: continue - division_total_ele *= new_shape[i] - for i in range(prev_shape.shape[0] - 2, -1, -1): - multipliers[i] = prev_shape[i + 1] * multipliers[i + 1] + division_total_ele *= new_shape_np[i] + for i in range(prev_shape_np.shape[0] - 2, -1, -1): + multipliers[i] = prev_shape_np[i + 1] * multipliers[i + 1] - for i in range(len(new_shape)): - if new_shape[i] == -1: - new_shape[i] = total_ele // division_total_ele + for i in range(len(new_shape_np)): + if new_shape_np[i] == -1: + new_shape_np[i] = total_ele // division_total_ele - if np.array_equal(prev_shape, new_shape): - return sparse_indices, prev_shape + if np.array_equal(prev_shape_np, new_shape_np): + return sparse_indices_np, prev_shape_np - for i in range(new_shape.shape[0] - 2, -1, -1): - dividers[i] = new_shape[i + 1] * dividers[i + 1] + for i in range(new_shape_np.shape[0] - 2, -1, -1): + dividers[i] = new_shape_np[i + 1] * dividers[i + 1] - for row_num, sparse_row in enumerate(sparse_indices): + for row_num, sparse_row in enumerate(sparse_indices_np): flat_idx = 0 - if len(sparse_indices.shape) != 1: + if len(sparse_indices_np.shape) != 1: for i, ele in enumerate(sparse_row): flat_idx += sparse_row[i] * multipliers[i] else: @@ -1593,17 +1573,20 @@ def ref_sparse_reshape( else: new_sparse_indices[row_num] = flat_idx - return new_sparse_indices, new_shape + return new_sparse_indices, new_shape_np - def verify_sparse_reshape( - sparse_indices_np: np.ndarray, - sparse_values_np: np.ndarray, - prev_shape_np: np.ndarray, - new_shape_np: np.ndarray, + @tvm.testing.known_failing_targets("vulkan") + def test_sparse_reshape( + self, + target, + dev, + ref_res, + sparse_indices_np, + sparse_values_np, + prev_shape_np, + new_shape_np, + use_dyn, ): - """ - This function verifies the relay output of sparse_reshape with its expected output. 
- """ if use_dyn: sparse_indices = relay.var( "sparse_indices", @@ -1635,7 +1618,6 @@ def verify_sparse_reshape( func = relay.Function([sparse_indices, prev_shape, new_shape], z) - ref_res = ref_sparse_reshape(sparse_indices_np, prev_shape_np, new_shape_np) outputs = run_infer_type(z) new_sparse_indices_infer_type, new_shape_infer_type = ( outputs.checked_type.fields[0].dtype, @@ -1645,23 +1627,16 @@ def verify_sparse_reshape( assert new_sparse_indices_infer_type == sparse_indices_np.dtype assert new_shape_infer_type == new_shape_np.dtype verify_func( + target, + dev, func, [sparse_indices_np, prev_shape_np, new_shape_np], ref_res, ) - verify_sparse_reshape( - sparse_indices_np, - sparse_values_np, - prev_shape_np, - new_shape_np, - ) - -@tvm.testing.uses_gpu -@pytest.mark.parametrize( - "data_np, segment_ids_np, num_segments", - [ +class TestSegmentSum: + data_np, segment_ids_np, num_segments = tvm.testing.parameters( ( np.array([5, 1, 7, 2, 3, 4], dtype=np.float32), np.array([0, 0, 1, 1, 0, 1], dtype=np.int32), @@ -1697,28 +1672,40 @@ def verify_sparse_reshape( np.array([0, 0, 1, 5, 5], dtype=np.int32), 100, ), - ], -) -@pytest.mark.parametrize("use_dyn", [True, False]) -def test_segment_sum(data_np, segment_ids_np, num_segments, use_dyn): - def ref_segment_sum( - data: np.ndarray, - segment_ids: np.ndarray, - num_segments: Optional[int] = None, + ) + + use_dyn = tvm.testing.parameter(True, False, ids=["dyn", "static"]) + + @tvm.testing.fixture(cache_return_value=True) + def ref_res( + self, + data_np: np.ndarray, + segment_ids_np: np.ndarray, + num_segments: Optional[int], ): """ This function calculates the expected output of segment_sum operator given the inputs. """ if not num_segments: - num_segments = np.unique(segment_ids).shape[0] + num_segments = np.unique(segment_ids_np).shape[0] - result = np.zeros((num_segments,) + data.shape[1:], data.dtype) - for i, index in enumerate(segment_ids): - result[index] += data[i] + result = np.zeros((num_segments,) + data_np.shape[1:], data_np.dtype) + for i, index in enumerate(segment_ids_np): + result[index] += data_np[i] return result - def verify_segment_sum( - data_np: np.ndarray, segment_ids_np: np.ndarray, num_segments: Optional[int] + # Optimization can produce tir.atomic_add, not currently supported + # on vulkan runtime. + @tvm.testing.known_failing_targets("vulkan") + def test_segment_sum( + self, + target, + dev, + ref_res: np.ndarray, + data_np: np.ndarray, + segment_ids_np: np.ndarray, + num_segments: Optional[int], + use_dyn: bool, ): """ This function verifies the relay output of segment_sum with its expected output. 
@@ -1745,40 +1732,35 @@ def verify_segment_sum( z = relay.op.segment_sum(data, segment_ids, num_segments) func = relay.Function([data, segment_ids], z) - ref_res = ref_segment_sum(data_np, segment_ids_np, num_segments=num_segments) segment_sum_result = run_infer_type(z) assert segment_sum_result.checked_type.dtype == data_np.dtype verify_func( + target, + dev, func, [data_np, segment_ids_np], ref_res, ) - verify_segment_sum(data_np, segment_ids_np, num_segments) - -def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()): +def verify_func(target, dev, func, data, ref_res): assert isinstance(data, list) - for target, dev in target_device: - for kind in ["vm"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - *data - ) - if isinstance(op_res, tvm.runtime.container.ADT): - assert len(op_res) == len( - ref_res - ), "Outputs from TVM and Python implementation must be equal " - - for op_result, ref_result in zip(op_res, ref_res): - tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) - else: - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - relay.backend.compile_engine.get().clear() + for kind in ["vm"]: + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(*data) + if isinstance(op_res, tvm.runtime.container.ADT): + assert len(op_res) == len( + ref_res + ), "Outputs from TVM and Python implementation must be equal " + + for op_result, ref_result in zip(op_res, ref_res): + tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) + else: + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + relay.backend.compile_engine.get().clear() -@tvm.testing.uses_gpu -def test_adv_index(): +def test_adv_index(target, dev, executor_kind): def verify_adv_index(data_shape, index_shapes): dtype = "float32" inputs = [relay.var("data", relay.TensorType(data_shape, dtype))] @@ -1793,12 +1775,10 @@ def verify_adv_index(data_shape, index_shapes): out = relay.op.adv_index(inputs) func = relay.Function(inputs, out) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - *np_args - ) - tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + *np_args + ) + tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-5) verify_adv_index((10, 5), [(3, 4), (3, 1)]) verify_adv_index( @@ -1815,7 +1795,12 @@ def verify_adv_index(data_shape, index_shapes): def run_binop_tests( - target, dev, binop_type: str, gt_func: Callable[..., np.array], identity_value: int + target, + dev, + executor_kind, + binop_type: str, + gt_func: Callable[..., np.array], + identity_value: int, ): def assert_relay_scanop( data_np: np.array, @@ -1833,9 +1818,10 @@ def assert_relay_scanop( out = scanops_supported[binop_type](inp, axis, out_dtype, exclusive=exclusive) func = relay.Function([inp], out) - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(data_np) - tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=rtol, atol=atol) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data_np + ) + tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=rtol, atol=atol) data = np.array([2, 3, 0]) 
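    # gt_func supplies the NumPy reference (np.cumsum or np.cumprod), so this
    # first case checks a plain 1-D scan before the axis, dtype, and exclusive
    # variants that assert_relay_scanop also accepts.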
assert_relay_scanop(data, gt_func(data)) @@ -1873,17 +1859,21 @@ def assert_relay_scanop( @tvm.testing.parametrize_targets -def test_cumsum(target, dev): - run_binop_tests(target, dev, binop_type="cumsum", gt_func=np.cumsum, identity_value=0) +def test_cumsum(target, dev, executor_kind): + run_binop_tests( + target, dev, executor_kind, binop_type="cumsum", gt_func=np.cumsum, identity_value=0 + ) @tvm.testing.parametrize_targets -def test_cumprod(target, dev): - run_binop_tests(target, dev, binop_type="cumprod", gt_func=np.cumprod, identity_value=1) +def test_cumprod(target, dev, executor_kind): + run_binop_tests( + target, dev, executor_kind, binop_type="cumprod", gt_func=np.cumprod, identity_value=1 + ) @tvm.testing.parametrize_targets -def test_scatter_nd(target, dev): +def test_scatter_nd(target, dev, executor_kind): def verify_scatter_nd( data_np, indices_np, updates_np, ref_res, mode="add", rtol=1e-5, atol=1e-5 ): @@ -1894,11 +1884,10 @@ def verify_scatter_nd( out = relay.op.scatter_nd(data, indices, updates, mode) func = relay.Function([data, indices, updates], out) - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data_np, indices_np, updates_np - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data_np, indices_np, updates_np + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol) def verify_scatter_nd_with_stack( data_np, indices_np, updates_np, ref_res, mode="add", rtol=1e-5, atol=1e-5 @@ -1921,9 +1910,10 @@ def verify_scatter_nd_with_stack( fargs = [data_np, updates_np] for a in indices_np: fargs.append(a) - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(*fargs) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + *fargs + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol, atol=atol) data = np.zeros((2, 2)).astype("int64") indices = np.array([[1, 1, 0], [0, 1, 0]]) @@ -1968,7 +1958,7 @@ def verify_scatter_nd_with_stack( verify_scatter_nd_with_stack(data, indices, updates, out, mode) -def test_unique(): +def test_unique(target, dev): def calc_numpy_unique(data, is_sorted=False): uniq, index, inverse, counts = np.unique( data, return_index=True, return_inverse=True, return_counts=True @@ -2004,32 +1994,27 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False): else: backends = ["graph", "debug"] - for target, dev in tvm.testing.enabled_targets(): - for kind in backends: - mod = tvm.ir.IRModule.from_expr(func) - tvm_res = relay.create_executor( - kind, mod=mod, device=dev, target=target - ).evaluate()( - x_data - ) # unique, indices, inverse_indices, num_unique, (counts) - np_res = calc_numpy_unique( - x_data, is_sorted - ) # unique, indices, inverse_indices, num_unique, counts - num_unique = np_res[3][0] - - # num_unique - assert num_unique == tvm_res[3].numpy()[0] - # unique - tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5) - # indices - tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5) - # inverse_indices - tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5) - # counts - if return_counts: - tvm.testing.assert_allclose( - tvm_res[4].numpy()[:num_unique], 
np_res[4], rtol=1e-5 - ) + for kind in backends: + mod = tvm.ir.IRModule.from_expr(func) + tvm_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( + x_data + ) # unique, indices, inverse_indices, num_unique, (counts) + np_res = calc_numpy_unique( + x_data, is_sorted + ) # unique, indices, inverse_indices, num_unique, counts + num_unique = np_res[3][0] + + # num_unique + assert num_unique == tvm_res[3].numpy()[0] + # unique + tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5) + # indices + tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5) + # inverse_indices + tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5) + # counts + if return_counts: + tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5) for dtype in ["int32", "int64"]: for i in range(8): @@ -2038,4 +2023,4 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False): if __name__ == "__main__": - pytest.main([__file__]) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 6415976bfd59..7b8e922fb721 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -14,15 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import sys + import numpy as np import numpy.random +import pytest + import tvm import tvm.testing import tvm.topi.testing + from tvm import relay, te from tvm.relay import transform from tvm.relay.testing import run_infer_type +executor_kind = tvm.testing.parameter("graph", "debug") + @tvm.testing.uses_gpu def test_binary_op(): @@ -223,123 +230,146 @@ def verify(x_np, y_np, cond_np): verify(x_np.astype(dtype), y_np.astype(dtype), cond_np) -def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32"): - test_func = funcs[0] - ref_func = funcs[1] - dtype = "bool" if ref_func in [np.all, np.any] else dtype - - x = relay.var("x", relay.TensorType(data, dtype)) - if test_func == relay.logsumexp: - z = test_func(x, axis, keepdims) - else: - z = test_func(x, axis, keepdims, exclude) - zz = run_infer_type(z) - if axis: - assert "axis=" in z.astext() - if keepdims: - assert "keepdims=" in z.astext() - if exclude: - assert "exclude=" in z.astext() - out_type = "int32" if test_func in [relay.argmin, relay.argmax] else dtype - assert zz.checked_type == relay.ty.TensorType(output, out_type) - - if all(isinstance(v, tvm.tir.Var) == 1 for v in data): - return - - func = relay.Function([x], z) - x_data = ( - np.random.choice([True, False], size=data) - if ref_func in [np.all] - else np.random.uniform(size=data).astype(dtype) +def _with_keepdims(func): + def _wrapper(data, axis=None, keepdims=False): + if not keepdims: + return func(data, axis=axis) + else: + if axis is not None: + axis = axis if isinstance(axis, int) else axis[0] + out_shape = list(data.shape) + out_shape[axis] = 1 + else: + out_shape = [1 for _ in range(len(data.shape))] + return func(data, axis=axis).reshape(out_shape) + + return _wrapper + + +def _np_log_sum_exp(x, axis, keepdims=False): + max_x = np.max(x, axis=axis, keepdims=True) + x = np.log(np.sum(np.exp(x - max_x), axis=axis, keepdims=True)) + x = x + max_x + if not keepdims: + x = np.squeeze(x, axis=axis) + return x + + +def _unbiased_relay_wrapper(f): + def _unbiased_func(x, axis=None, keepdims=False, exclude=False): + 
return f(x, axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True) + + return _unbiased_func + + +def _unbiased_np_wrapper(f): + def _unbiased_func(a, axis=None, dtype=None, keepdims=None): + return f(a, axis=axis, dtype=dtype, ddof=1, keepdims=keepdims) + + return _unbiased_func + + +class TestReduceFunctions: + funcs = { + "sum": (relay.sum, np.sum), + "max": (relay.max, np.max), + "min": (relay.min, np.min), + "mean": (relay.mean, np.mean), + "var": (relay.variance, np.var), + "unbiased_var": (_unbiased_relay_wrapper(relay.variance), _unbiased_np_wrapper(np.var)), + "std": (relay.std, np.std), + "unbiased_std": (_unbiased_relay_wrapper(relay.std), _unbiased_np_wrapper(np.std)), + "prod": (relay.prod, np.prod), + "all": (relay.all, np.all), + "any": (relay.any, np.any), + "logsumexp": (relay.logsumexp, _np_log_sum_exp), + "argmin": (relay.argmin, _with_keepdims(np.argmin)), + "argmax": (relay.argmax, _with_keepdims(np.argmax)), + } + relay_func, ref_func = tvm.testing.parameters( + *funcs.values(), + ids=list(funcs), ) - if ref_func in [np.sum]: - ref_res = ref_func(x_data + 0, axis=axis, dtype=dtype, keepdims=keepdims) - elif ref_func in [np.max, np.min, np.mean, np.prod]: - ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) - else: # argmin/argmax - if axis and not isinstance(axis, int) and len(axis) > 1: - return - ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) - - for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) - + d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") -@tvm.testing.uses_gpu -def test_reduce_functions(): - def _with_keepdims(func): - def _wrapper(data, axis=None, keepdims=False): - if not keepdims: - return func(data, axis=axis) - else: - if axis is not None: - axis = axis if isinstance(axis, int) else axis[0] - out_shape = list(data.shape) - out_shape[axis] = 1 - else: - out_shape = [1 for _ in range(len(data.shape))] - return func(data, axis=axis).reshape(out_shape) - - return _wrapper - - def _np_log_sum_exp(x, axis, keepdims=False): - max_x = np.max(x, axis=axis, keepdims=True) - x = np.log(np.sum(np.exp(x - max_x), axis=axis, keepdims=True)) - x = x + max_x - if not keepdims: - x = np.squeeze(x, axis=axis) - return x + data, axis, keepdims, exclude, output = tvm.testing.parameters( + ((d1, d2, d3, d4), None, False, False, ()), + ((d1, d2, d3, d4), 2, True, False, (d1, d2, 1, d4)), + ((d1, d2, d3, d4), 0, True, False, (1, d2, d3, d4)), + ((d1, d2, d3), 1, True, False, (d1, 1, d3)), + ((d1, d2, d3), 0, True, False, (1, d2, d3)), + ((d1, d2, d3), None, True, False, (1, 1, 1)), + ((d1, d2, d3), (0, 1), True, False, (1, 1, d3)), + ((2, 3, 4), 1, True, False, (2, 1, 4)), + ((2, 3, 4), (1,), True, False, (2, 1, 4)), + ((2, 3, 4), -1, True, False, (2, 3, 1)), + ((2, 3, 4), (0, 1, 2), False, False, ()), + ((4, 4, 3), None, False, False, ()), + ((4, 4, 3), (0, 2), False, False, (4,)), + ((128, 24, 128), (0, 1), False, False, (128,)), + ((128, 24, 128), (0, 2), False, False, (24,)), + ((128, 24, 128), (0, 1), True, False, (1, 1, 128)), + ((128, 24, 128), (0, 2), True, False, (1, 24, 1)), + ) - def _unbiased_relay_wrapper(f): - def _unbiased_func(x, axis=None, keepdims=False, exclude=False): - return f(x, 
axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True) + def test_reduce( + self, + target, + dev, + relay_func, + ref_func, + executor_kind, + data, + axis, + keepdims, + exclude, + output, + ): + dtype = "bool" if ref_func in [np.all, np.any] else "float32" + out_type = "int32" if relay_func in [relay.argmin, relay.argmax] else dtype - return _unbiased_func + target = tvm.target.Target(target) + if target.kind.name == "vulkan" and dtype == "bool": + pytest.xfail("Known failing test on vulkan runtime") - def _unbiased_np_wrapper(f): - def _unbiased_func(a, axis=None, dtype=None, keepdims=None): - return f(a, axis=axis, dtype=dtype, ddof=1, keepdims=keepdims) + x = relay.var("x", relay.TensorType(data, dtype)) + if relay_func == relay.logsumexp: + z = relay_func(x, axis, keepdims) + else: + z = relay_func(x, axis, keepdims, exclude) + zz = run_infer_type(z) + if axis: + assert "axis=" in z.astext() + if keepdims: + assert "keepdims=" in z.astext() + if exclude: + assert "exclude=" in z.astext() + assert zz.checked_type == relay.ty.TensorType(output, out_type) + + if all(isinstance(v, tvm.tir.Var) == 1 for v in data): + return - return _unbiased_func + func = relay.Function([x], z) + x_data = ( + np.random.choice([True, False], size=data) + if ref_func in [np.all] + else np.random.uniform(size=data).astype(dtype) + ) - d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") - for func in [ - [relay.sum, np.sum], - [relay.max, np.max], - [relay.min, np.min], - [relay.mean, np.mean], - [relay.variance, np.var], - [_unbiased_relay_wrapper(relay.variance), _unbiased_np_wrapper(np.var)], - [relay.std, np.std], - [_unbiased_relay_wrapper(relay.std), _unbiased_np_wrapper(np.std)], - [relay.prod, np.prod], - [relay.all, np.all], - [relay.any, np.any], - [relay.logsumexp, _np_log_sum_exp], - [relay.argmin, _with_keepdims(np.argmin)], - [relay.argmax, _with_keepdims(np.argmax)], - ]: - verify_reduce(func, (d1, d2, d3, d4), None, False, False, ()) - verify_reduce(func, (d1, d2, d3, d4), 2, True, False, (d1, d2, 1, d4)) - verify_reduce(func, (d1, d2, d3, d4), 0, True, False, (1, d2, d3, d4)) - verify_reduce(func, (d1, d2, d3), 1, True, False, (d1, 1, d3)) - verify_reduce(func, (d1, d2, d3), 0, True, False, (1, d2, d3)) - verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1)) - verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3)) - verify_reduce(func, (2, 3, 4), 1, True, False, (2, 1, 4)) - verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4)) - verify_reduce(func, (2, 3, 4), -1, True, False, (2, 3, 1)) - verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ()) - verify_reduce(func, (4, 4, 3), None, False, False, ()) - verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,)) - verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,)) - verify_reduce(func, (128, 24, 128), (0, 2), False, False, (24,)) - verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128)) - verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1)) + if ref_func in [np.sum]: + ref_res = ref_func(x_data + 0, axis=axis, dtype=dtype, keepdims=keepdims) + elif ref_func in [np.max, np.min, np.mean, np.prod]: + ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) + else: # argmin/argmax + if axis and not isinstance(axis, int) and len(axis) > 1: + return + ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) + + op_res1 = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + 
tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu @@ -611,13 +641,4 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True): if __name__ == "__main__": - test_strided_slice() - test_dyn_strided_slice() - # test_strided_set() - # test_binary_op() - # test_cmp_type() - # test_binary_int_broadcast_1() - # test_binary_int_broadcast_2() - # test_where() - # test_reduce_functions() - # test_mean_var_std() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c08b538d22e6..3414fd453646 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -17,14 +17,19 @@ """ Support level5 operator test cases. """ import math +import sys import numpy as np +import pytest + import tvm import tvm.testing import tvm.topi.testing from tvm import relay, te from tvm.relay.testing import run_infer_type +executor_kind = tvm.testing.parameter("graph", "debug") + def test_resize1d_infer_type(): n, c, w = te.size_var("n"), te.size_var("c"), te.size_var("w") @@ -41,9 +46,31 @@ def test_resize1d_infer_type(): assert zz.checked_type == relay.TensorType((n, c, 200), "int8") -@tvm.testing.uses_gpu -def test_resize1d(): - def verify_resize(dshape, scale, method, layout, coord_trans): +class TestResize1D: + interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic") + coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") + + layout = tvm.testing.parameter("NWC", "NCW") + dshape, scale = tvm.testing.parameters( + ((1, 4, 4), 2), + ((2, 8, 17), 3), + ((2, 8, 17), 3), + ((3, 4, 5), 5), + ) + + def test_resize( + self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans + ): + target_kind = tvm.target.Target(target).kind.name + if ( + target_kind == "vulkan" + and dshape == (3, 4, 5) + and scale == 5 + and interpolate_method == "nearest_neighbor" + and coord_trans == "align_corners" + ): + pytest.xfail("Known failing case for these parameters") + if layout == "NWC": size = (dshape[1] * scale,) else: @@ -51,29 +78,21 @@ def verify_resize(dshape, scale, method, layout, coord_trans): x_data = np.random.uniform(size=dshape).astype("float32") - ref_res = tvm.topi.testing.resize1d_python(x_data, (scale,), layout, method, coord_trans) + ref_res = tvm.topi.testing.resize1d_python( + x_data, (scale,), layout, interpolate_method, coord_trans + ) x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.image.resize1d( - x, size, layout, method, coordinate_transformation_mode=coord_trans + x, size, layout, interpolate_method, coordinate_transformation_mode=coord_trans ) assert "size=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4) - - for method in ["nearest_neighbor", "linear", "cubic"]: - for coord_trans in ["asymmetric", "align_corners", "half_pixel"]: - for layout in ["NWC", "NCW"]: - verify_resize((1, 4, 4), 2, method, layout, coord_trans) - verify_resize((2, 8, 17), 3, method, layout, coord_trans) - verify_resize((2, 8, 17), 3, method, layout, coord_trans) - verify_resize((3, 4, 5), 5, method, layout, coord_trans) + op_res = 
relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4) def test_resize2d_infer_type(): @@ -91,9 +110,32 @@ def test_resize2d_infer_type(): assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8") -@tvm.testing.uses_gpu -def test_resize2d(): - def verify_resize(dshape, scale, method, layout, coord_trans): +class TestResize2D: + interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic") + coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") + + layout = tvm.testing.parameter("NHWC", "NCHW") + + dshape, scale = tvm.testing.parameters( + ((1, 4, 4, 4), 2), + ((2, 8, 17, 20), 3), + ((2, 8, 17, 20), 3), + ((3, 4, 5, 6), 5), + ) + + def test_resize( + self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans + ): + target_kind = tvm.target.Target(target).kind.name + if ( + target_kind == "vulkan" + and dshape == (3, 4, 5, 6) + and scale == 5 + and interpolate_method == "nearest_neighbor" + and coord_trans == "align_corners" + ): + pytest.xfail("Known failing case for these parameters") + if layout == "NHWC": size = (dshape[1] * scale, dshape[2] * scale) else: @@ -102,30 +144,20 @@ def verify_resize(dshape, scale, method, layout, coord_trans): x_data = np.random.uniform(size=dshape).astype("float32") ref_res = tvm.topi.testing.resize2d_python( - x_data, (scale, scale), layout, method, coord_trans + x_data, (scale, scale), layout, interpolate_method, coord_trans ) x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.image.resize2d( - x, size, layout, method, coordinate_transformation_mode=coord_trans + x, size, layout, interpolate_method, coordinate_transformation_mode=coord_trans ) assert "size=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4) - - for method in ["nearest_neighbor", "linear", "cubic"]: - for coord_trans in ["asymmetric", "align_corners", "half_pixel"]: - for layout in ["NHWC", "NCHW"]: - verify_resize((1, 4, 4, 4), 2, method, layout, coord_trans) - verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans) - verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans) - verify_resize((3, 4, 5, 6), 5, method, layout, coord_trans) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-4) def test_resize3d_infer_type(): @@ -149,9 +181,19 @@ def test_resize3d_infer_type(): assert zz.checked_type == relay.TensorType((n, c, 10, 10, 20), "int8") -@tvm.testing.parametrize_targets -def test_resize3d(target, dev): - def verify_resize(dshape, scale, method, layout): +class TestResize3D: + interpolate_method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic") + coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") + + layout = tvm.testing.parameter("NDHWC", "NCDHW") + + dshape, scale = tvm.testing.parameters( + ((1, 4, 4, 4, 4), 2), + ) + + def test_resize( + self, target, dev, executor_kind, dshape, scale, interpolate_method, layout, coord_trans + ): if layout == "NDHWC": 
size = (dshape[1] * scale, dshape[2] * scale, dshape[3] * scale) else: @@ -159,35 +201,59 @@ def verify_resize(dshape, scale, method, layout): x_data = np.random.uniform(size=dshape).astype("float32") ref_res = tvm.topi.testing.resize3d_python( - x_data, (scale, scale, scale), layout, method, "align_corners" + x_data, (scale, scale, scale), layout, interpolate_method, coord_trans ) x = relay.var("x", relay.TensorType(dshape, "float32")) - z = relay.image.resize3d(x, size, layout, method, "align_corners") + z = relay.image.resize3d(x, size, layout, interpolate_method, coord_trans) assert "size=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) - for method in ["nearest_neighbor", "linear", "cubic"]: - for coord_trans in ["asymmetric", "align_corners", "half_pixel"]: - for layout in ["NDHWC", "NCDHW"]: - verify_resize((1, 4, 4, 4, 4), 2, method, layout) +class TestCropAndResize: + interpolate_method = tvm.testing.parameter("bilinear", "nearest_neighbor") + layout = tvm.testing.parameter("NHWC", "NCHW") -@tvm.testing.uses_gpu -def test_crop_and_resize(): - def verify_crop_and_resize( - img_shape, boxes, box_indices, crop_size, layout, method, extrapolation_value=0.0 - ): + def test_crop_and_resize(self, target, dev, executor_kind, layout, interpolate_method): + target_kind = tvm.target.Target(target).kind.name + if ( + target_kind == "vulkan" + and layout == "NHWC" + and interpolate_method == "nearest_neighbor" + ): + pytest.xfail("Known failing case for these parameters") + + extrapolation_value = 0.0 + + if layout == "NHWC": + img_shape = (10, 224, 224, 3) + boxes = np.array([[0.1, 0.2, 0.8, 0.7], [0.2, 0, 1, 0.6]]).astype("float32") + box_indices = np.array([1, 0]).astype("int32") + crop_size = np.array([20, 30]).astype("int32") + elif layout == "NCHW": + img_shape = (5, 3, 255, 255) + boxes = np.array([[0, 0, 1, 1], [0.2, 0.1, 1, 0.9]]).astype("float32") + box_indices = np.array([0, 1]).astype("int32") + crop_size = np.array([30, 30]).astype("int32") + else: + raise ValueError(f"Unknown layout: {layout}") image_data = np.random.uniform(size=img_shape).astype("float32") ref_res = tvm.topi.testing.crop_and_resize_python( - image_data, boxes, box_indices, crop_size, layout, method, extrapolation_value + image_data, + boxes, + box_indices, + crop_size, + layout, + interpolate_method, + extrapolation_value, ) img = relay.var("img", relay.TensorType(img_shape, "float32")) @@ -195,33 +261,16 @@ def verify_crop_and_resize( bx_idx = relay.var("bx_idx", relay.TensorType(box_indices.shape, "int32")) z = relay.image.crop_and_resize( - img, bx, bx_idx, list(crop_size), layout, method, extrapolation_value + img, bx, bx_idx, list(crop_size), layout, interpolate_method, extrapolation_value ) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([img, bx, bx_idx], z) - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - image_data, boxes, box_indices - ) - 
tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-04) - - boxes_nhwc = np.array([[0.1, 0.2, 0.8, 0.7], [0.2, 0, 1, 0.6]]).astype("float32") - indices_nhwc = np.array([1, 0]).astype("int32") - size_nhwc = np.array([20, 30]).astype("int32") - boxes_nchw = np.array([[0, 0, 1, 1], [0.2, 0.1, 1, 0.9]]).astype("float32") - indices_nchw = np.array([0, 1]).astype("int32") - size_nchw = np.array([30, 30]).astype("int32") - - for method in ["bilinear", "nearest_neighbor"]: - verify_crop_and_resize( - (10, 224, 224, 3), boxes_nhwc, indices_nhwc, size_nhwc, "NHWC", method - ) - verify_crop_and_resize( - (5, 3, 255, 255), boxes_nchw, indices_nchw, size_nchw, "NCHW", method, 0.1 + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + image_data, boxes, box_indices ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-3, atol=1e-04) @tvm.testing.uses_gpu @@ -957,90 +1006,74 @@ def verify_yolo_reorg(shape, stride): verify_yolo_reorg((1, 4, 6, 6), 2) -@tvm.testing.uses_gpu -def test_deformable_conv2d(): - def test_infer_type(batch, in_channel, size, out_channel, deformable_groups, groups, layout): - kernel_size = (3, 3) +class TestDeformableConv2D: + batch, in_channel, size, out_channel, deformable_groups = tvm.testing.parameters( + (1, 4, 16, 4, 4), + (2, 4, 16, 4, 1), + ) + kernel_size = tvm.testing.parameter((3, 3)) + groups = tvm.testing.parameter(1, 2) + layout = tvm.testing.parameter("NCHW", "NHWC") + dtype = tvm.testing.parameter("float32") + + @tvm.testing.fixture + def data_shape(self, layout, batch, in_channel, size): if layout == "NCHW": - kernel_layout = "OIHW" - data_shape = (batch, in_channel, size, size) - weight_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1]) - out_shape = (batch, out_channel, size, size) - offset_shape = ( - batch, - 2 * kernel_size[0] * kernel_size[1] * deformable_groups, - out_shape[2], - out_shape[3], - ) - else: - kernel_layout = "HWIO" - data_shape = (batch, size, size, in_channel) - weight_shape = (kernel_size[0], kernel_size[1], in_channel // groups, out_channel) - out_shape = (batch, size, size, out_channel) - offset_shape = ( - batch, - out_shape[1], - out_shape[2], - 2 * kernel_size[0] * kernel_size[1] * deformable_groups, - ) + return (batch, in_channel, size, size) + elif layout == "NHWC": + return (batch, size, size, in_channel) - data = relay.var("data", shape=data_shape) - offset = relay.var("offset") - kernel = relay.var("kernel") - y = relay.nn.deformable_conv2d( - data, - offset, - kernel, - strides=(1, 1), - padding=(1, 1), - dilation=(1, 1), - data_layout=layout, - kernel_layout=kernel_layout, - kernel_size=kernel_size, - deformable_groups=deformable_groups, - groups=groups, - channels=out_channel, - ) - yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType(out_shape), yy.checked_type - assert yy.args[1].checked_type == relay.TensorType(offset_shape), yy.args[1].checked_type - assert yy.args[2].checked_type == relay.TensorType(weight_shape), yy.args[2].checked_type + @tvm.testing.fixture + def kernel_shape(self, layout, in_channel, out_channel, groups, kernel_size): + if layout == "NCHW": + return (out_channel, in_channel // groups, kernel_size[0], kernel_size[1]) + elif layout == "NHWC": + return (kernel_size[0], kernel_size[1], in_channel // groups, out_channel) - test_infer_type(1, 4, 16, 4, 4, 1, "NCHW") - test_infer_type(2, 4, 16, 4, 1, 2, "NCHW") - test_infer_type(1, 4, 16, 4, 4, 1, "NHWC") - test_infer_type(2, 4, 16, 4, 1, 2, "NHWC") + 
@tvm.testing.fixture + def out_shape(self, layout, batch, out_channel, size): + if layout == "NCHW": + return (batch, out_channel, size, size) + elif layout == "NHWC": + return (batch, size, size, out_channel) - def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, layout): - kernel_size = (3, 3) + @tvm.testing.fixture + def offset_shape(self, layout, batch, kernel_size, deformable_groups, out_shape): if layout == "NCHW": - kernel_layout = "OIHW" - data_shape = (batch, in_channel, size, size) - kernel_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1]) - out_shape = (batch, out_channel, size, size) - offset_shape = ( + return ( batch, 2 * kernel_size[0] * kernel_size[1] * deformable_groups, out_shape[2], out_shape[3], ) - else: - kernel_layout = "HWIO" - data_shape = (batch, size, size, in_channel) - kernel_shape = (kernel_size[0], kernel_size[1], in_channel // groups, out_channel) - out_shape = (batch, size, size, out_channel) - offset_shape = ( + elif layout == "NHWC": + return ( batch, out_shape[1], out_shape[2], 2 * kernel_size[0] * kernel_size[1] * deformable_groups, ) - dtype = "float32" + @tvm.testing.fixture + def kernel_layout(self, layout): + return {"NCHW": "OIHW", "NHWC": "HWIO"}[layout] + + @tvm.testing.fixture + def relay_setup( + self, + dtype, + data_shape, + layout, + kernel_layout, + kernel_size, + deformable_groups, + groups, + out_channel, + ): data = relay.var("data", shape=data_shape, dtype=dtype) - offset = relay.var("offset") - kernel = relay.var("kernel") - y = relay.nn.deformable_conv2d( + offset = relay.var("offset", dtype=dtype) + kernel = relay.var("kernel", dtype=dtype) + expr = relay.nn.deformable_conv2d( data, offset, kernel, @@ -1054,7 +1087,37 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, la groups=groups, channels=out_channel, ) - func = relay.Function([data, offset, kernel], y) + func = relay.Function([data, offset, kernel], expr) + return expr, func + + def test_infer_type(self, relay_setup, out_shape, offset_shape, kernel_shape): + expr, func = relay_setup + yy = run_infer_type(expr) + assert yy.checked_type == relay.TensorType(out_shape), yy.checked_type + assert yy.args[1].checked_type == relay.TensorType(offset_shape), yy.args[1].checked_type + assert yy.args[2].checked_type == relay.TensorType(kernel_shape), yy.args[2].checked_type + + # The reference python implementation only supports groups==1. 
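+    # Overriding the class-level `groups` parameter via pytest.mark.parametrize
+    # restricts this test to groups==1; test_infer_type still covers both values.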
+ @pytest.mark.parametrize("groups", [1]) + def test_run( + self, + target, + dev, + dtype, + executor_kind, + data_shape, + offset_shape, + kernel_shape, + relay_setup, + deformable_groups, + groups, + layout, + ): + target = tvm.target.Target(target) + if layout == "NHWC" and target.kind.name != "llvm": + pytest.xfail("Can only run NHWC layout on llvm") + + expr, func = relay_setup data = np.random.uniform(size=data_shape).astype(dtype) offset = np.random.uniform(size=offset_shape).astype(dtype) kernel = np.random.uniform(size=kernel_shape).astype(dtype) @@ -1080,19 +1143,11 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, la deformable_groups=deformable_groups, groups=groups, ) - for target, dev in tvm.testing.enabled_targets(): - if target == "cuda" and layout == "NHWC": - continue # Cannot run NHWC layout on cuda target, only on llvm - for kind in ["graph", "debug"]: - op_res1 = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data, offset, kernel - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) - test_run(1, 4, 16, 4, 1, 1, "NCHW") - test_run(1, 4, 16, 4, 1, 1, "NHWC") - test_run(2, 4, 16, 4, 4, 1, "NCHW") - test_run(2, 4, 16, 4, 4, 1, "NHWC") + op_res1 = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data, offset, kernel + ) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) @tvm.testing.uses_gpu @@ -1202,119 +1257,111 @@ def test_dilation2d_infer_type(): assert yy.checked_type == relay.TensorType((n, 10, 217, 217), "float32") -@tvm.testing.uses_gpu -def test_dilation2d_run(): - def run_test_dilation2d( - indata, - kernel, - out, - dtype="float32", - strides=[1, 1], - padding=[0, 0], - dilations=[1, 1], - except_targets=["cuda"], - **attrs, - ): - - dshape = indata.shape - kshape = kernel.shape - - if except_targets is None: - except_targets = [] - - x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", shape=kshape, dtype=dtype) - y = relay.image.dilation2d( - x, w, strides=strides, dilations=dilations, padding=padding, **attrs - ) - func = relay.Function([x, w], y) +class TestDilation2DRun: + data_layout, kernel_layout = tvm.testing.parameters(("NCHW", "IHW"), ("NHWC", "HWI")) + dtype = tvm.testing.parameter("float32") + + config = tvm.testing.parameter( + dict( + image=[[[[0.1], [0.2]], [[0.3], [0.4]]]], + kernel=[[[0.4], [0.3]], [[0.1], [0.0]]], + out=[[[[0.5]]]], + ), + dict( + image=[[[[0.1], [0.2]], [[0.3], [0.4]]]], + kernel=[[[0.4], [0.3]], [[0.1], [0.0]]], + out=[[[[0.5], [0.6]], [[0.7], [0.8]]]], + padding=[0, 0, 1, 1], + ), + dict( + image=[[[[0.1, 0.2, 0.0], [0.2, 0.3, 0.1]], [[0.3, 0.4, 0.2], [0.4, 0.5, 0.3]]]], + kernel=[[[0.4, 0.5, 0.3], [0.3, 0.4, 0.2]], [[0.1, 0.2, 0.0], [0.0, 0.1, -0.1]]], + out=[[[[0.5, 0.7, 0.3], [0.6, 0.8, 0.4]], [[0.7, 0.9, 0.5], [0.8, 1.0, 0.6]]]], + padding=[0, 0, 1, 1], + ), + dict( + image=[[[[0.1], [0.2]], [[0.3], [0.4]]], [[[0.2], [0.3]], [[0.4], [0.5]]]], + kernel=[[[0.4], [0.3]], [[0.1], [0.0]]], + out=[[[[0.5], [0.6]], [[0.7], [0.8]]], [[[0.6], [0.7]], [[0.8], [0.9]]]], + padding=[0, 0, 1, 1], + ), + dict( + image=[[[[0.1], [0.2]], [[0.3], [0.4]]]], + kernel=[[[0.4], [0.3]]], + out=[[[[0.5]], [[0.7]]]], + ), + dict( + image=[[[[0.1], [0.2], [0.3]], [[0.4], [0.5], [0.6]], [[0.7], [0.8], [0.9]]]], + kernel=[[[0.4], [0.3]], [[0.1], [0.2]]], + out=[[[[0.7], [0.8], [0.6]], [[1.0], [1.1], [0.9]], [[0.8], [0.9], [0.9]]]], + padding=[1, 1], + dilations=[2, 2], + ), + dict( + 
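+            # Strided case: a 2x2 kernel over the 3x4 image below with
+            # strides [1, 2] produces a 2x2 output.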
image=[ + [ + [[0.1], [0.2], [0.3], [0.4]], + [[0.5], [0.6], [0.7], [0.8]], + [[0.9], [1.0], [1.1], [1.2]], + ] + ], + kernel=[[[0.4], [0.3]], [[0.1], [0.2]]], + out=[[[[0.8], [1.0]], [[1.2], [1.4]]]], + strides=[1, 2], + ), + ) - for target, dev in tvm.testing.enabled_targets(): - if target in except_targets: - continue - op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - indata, kernel - ) - tvm.testing.assert_allclose(op_res.numpy(), out, rtol=1e-5, atol=1e-5) + @tvm.testing.fixture + def test_case(self, config, data_layout, dtype): + indata = np.array(config["image"], dtype=dtype) + kernel = np.array(config["kernel"], dtype=dtype) + out = np.array(config["out"], dtype=dtype) - def _convert_data(indata, kernel, out, layout=None): - indata = np.asarray(indata) - kernel = np.asarray(kernel) - out = np.asarray(out) - if layout == "NCHW": + if data_layout == "NHWC": + pass + elif data_layout == "NCHW": indata = indata.transpose([0, 3, 1, 2]) kernel = kernel.transpose([2, 0, 1]) out = out.transpose([0, 3, 1, 2]) - return indata, kernel, out + else: + raise ValueError(f"Unsupported layout '{data_layout}'") - image = [[[[0.1], [0.2]], [[0.3], [0.4]]]] - kernel = [[[0.4], [0.3]], [[0.1], [0.0]]] - out = [[[[0.5]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW")) - run_test_dilation2d(*_convert_data(image, kernel, out), data_layout="NHWC", kernel_layout="HWI") - - image = [[[[0.1], [0.2]], [[0.3], [0.4]]]] - kernel = [[[0.4], [0.3]], [[0.1], [0.0]]] - out = [[[[0.5], [0.6]], [[0.7], [0.8]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW"), padding=[0, 0, 1, 1]) - run_test_dilation2d( - *_convert_data(image, kernel, out), - padding=[0, 0, 1, 1], - data_layout="NHWC", - kernel_layout="HWI", - ) + return indata, kernel, out - image = [[[[0.1, 0.2, 0.0], [0.2, 0.3, 0.1]], [[0.3, 0.4, 0.2], [0.4, 0.5, 0.3]]]] - kernel = [[[0.4, 0.5, 0.3], [0.3, 0.4, 0.2]], [[0.1, 0.2, 0.0], [0.0, 0.1, -0.1]]] - out = [[[[0.5, 0.7, 0.3], [0.6, 0.8, 0.4]], [[0.7, 0.9, 0.5], [0.8, 1.0, 0.6]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW"), padding=[0, 0, 1, 1]) - run_test_dilation2d( - *_convert_data(image, kernel, out), - padding=[0, 0, 1, 1], - data_layout="NHWC", - kernel_layout="HWI", - ) + @tvm.testing.parametrize_targets("llvm") + def test_dilation2d( + self, + target, + dev, + test_case, + dtype, + config, + data_layout, + kernel_layout, + ): + strides = config.get("strides", [1, 1]) + padding = config.get("padding", [0, 0]) + dilations = config.get("dilations", [1, 1]) - image = [[[[0.1], [0.2]], [[0.3], [0.4]]], [[[0.2], [0.3]], [[0.4], [0.5]]]] - kernel = [[[0.4], [0.3]], [[0.1], [0.0]]] - out = [[[[0.5], [0.6]], [[0.7], [0.8]]], [[[0.6], [0.7]], [[0.8], [0.9]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW"), padding=[0, 0, 1, 1]) - run_test_dilation2d( - *_convert_data(image, kernel, out), - padding=[0, 0, 1, 1], - data_layout="NHWC", - kernel_layout="HWI", - ) + indata, kernel, out = test_case - image = [[[[0.1], [0.2]], [[0.3], [0.4]]]] - kernel = [[[0.4], [0.3]]] - out = [[[[0.5]], [[0.7]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW")) - run_test_dilation2d(*_convert_data(image, kernel, out), data_layout="NHWC", kernel_layout="HWI") - - image = [[[[0.1], [0.2], [0.3]], [[0.4], [0.5], [0.6]], [[0.7], [0.8], [0.9]]]] - kernel = [[[0.4], [0.3]], [[0.1], [0.2]]] - out = [[[[0.7], [0.8], [0.6]], [[1.0], [1.1], [0.9]], [[0.8], [0.9], [0.9]]]] - 
run_test_dilation2d( - *_convert_data(image, kernel, out, layout="NCHW"), padding=[1, 1], dilations=[2, 2] - ) - run_test_dilation2d( - *_convert_data(image, kernel, out), - padding=[1, 1], - dilations=[2, 2], - data_layout="NHWC", - kernel_layout="HWI", - ) + x = relay.var("x", shape=indata.shape, dtype=dtype) + w = relay.var("w", shape=kernel.shape, dtype=dtype) + y = relay.image.dilation2d( + x, + w, + strides=strides, + dilations=dilations, + padding=padding, + data_layout=data_layout, + kernel_layout=kernel_layout, + ) + func = relay.Function([x, w], y) - image = [ - [[[0.1], [0.2], [0.3], [0.4]], [[0.5], [0.6], [0.7], [0.8]], [[0.9], [1.0], [1.1], [1.2]]] - ] - kernel = [[[0.4], [0.3]], [[0.1], [0.2]]] - out = [[[[0.8], [1.0]], [[1.2], [1.4]]]] - run_test_dilation2d(*_convert_data(image, kernel, out, layout="NCHW"), strides=[1, 2]) - run_test_dilation2d( - *_convert_data(image, kernel, out), strides=[1, 2], data_layout="NHWC", kernel_layout="HWI" - ) + op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + indata, kernel + ) + tvm.testing.assert_allclose(op_res.numpy(), out, rtol=1e-5, atol=1e-5) @tvm.testing.uses_gpu @@ -1523,25 +1570,4 @@ def verify_all_class_non_max_suppression( if __name__ == "__main__": - test_resize_infer_type() - test_resize() - test_resize3d_infer_type() - test_crop_and_resize() - test_multibox_prior() - test_multibox_transform_loc() - test_get_valid_counts() - test_roi_align() - test_roi_pool() - test_proposal() - test_yolo_reorg_infer_shape() - test_yolo_reorg() - test_non_max_suppression() - test_deformable_conv2d() - test_depth_to_space() - test_space_to_depth() - test_dilation2d_infer_type() - test_dilation2d_run() - test_affine_grid() - test_grid_sample() - test_space_to_batch_nd() - test_all_class_non_max_suppression() + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py index ac92692c727f..9fd3d1b84537 100644 --- a/tests/python/relay/test_op_qnn_conv2_transpose.py +++ b/tests/python/relay/test_op_qnn_conv2_transpose.py @@ -404,6 +404,58 @@ def test_both_zero_point(): verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) +def test_different_dtype(): + # uint8 input and int8 weight + data_shape = (2, 4, 2, 4) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "int8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + channels=kernel_shape[1], + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # int8 input and uint8 weight + data_shape = (2, 4, 2, 4) + data_dtype = "int8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + channels=kernel_shape[1], + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + def test_layout(): # uint8 input 
data_shape = (2, 2, 4, 4) # NHWC @@ -631,6 +683,7 @@ def test_per_channel_kernel_scale(): test_input_zero_point() test_kernel_zero_point() test_both_zero_point() + test_different_dtype() test_layout() test_padding() test_const_folding() diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index b5702a1542a9..ef5824c957e8 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -712,7 +712,8 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) -def test_alter_layout_strided_slice(): +@tvm.testing.parametrize_targets("llvm") +def test_alter_layout_strided_slice(target, dev): """Test rewriting strided_slice during alter_iop_layout""" def before(): @@ -756,24 +757,20 @@ def expected(): mod_before = transform.InferType()(mod_before) mod_new = transform.InferType()(mod_new) with relay.build_config(opt_level=3): - for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug", "vm"]: - np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32") - np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32") - f_before = relay.create_executor( - kind, mod=mod_before, device=dev, target=target - ).evaluate() - result_before = f_before(np_data, np_weight) - f_new = relay.create_executor( - kind, mod=mod_new, device=dev, target=target - ).evaluate() - result_new = f_new(np_data, np_weight) - tvm.testing.assert_allclose( - result_before.numpy(), result_new.numpy(), rtol=1e-5, atol=1e-5 - ) - - -@tvm.testing.uses_gpu + for kind in ["graph", "debug", "vm"]: + np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32") + np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32") + f_before = relay.create_executor( + kind, mod=mod_before, device=dev, target=target + ).evaluate() + result_before = f_before(np_data, np_weight) + f_new = relay.create_executor(kind, mod=mod_new, device=dev, target=target).evaluate() + result_new = f_new(np_data, np_weight) + tvm.testing.assert_allclose( + result_before.numpy(), result_new.numpy(), rtol=1e-5, atol=1e-5 + ) + + def test_alter_layout_strided_slice_axes_nhwc(): """Test rewriting strided_slice with axes during alter_iop_layout""" @@ -841,7 +838,7 @@ def before(): from tvm import topi def alter_conv2d(attrs, inputs, tinfos, out_type): - with tvm.target.Target("llvm"): + with tvm.target.Target("llvm -mcpu=core-avx2"): return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) def expected(): @@ -1317,15 +1314,15 @@ def before(): def expected(): x = relay.var("x", shape=(32, 64)) weight = relay.var("weight", shape=(48, 64)) - target_layout = "NK16n" - weight_transform = relay.layout_transform(weight, "NK", target_layout) + target_layout = "NC16n" + weight_transform = relay.layout_transform(weight, "NC", target_layout) y = relay.nn.contrib_dense_pack( x, weight_transform, target_layout, units=None, out_dtype="float32" ) y = relay.Function(analysis.free_vars(y), y) return y - target = "llvm" + target = "llvm -mcpu=core-avx2" with tvm.target.Target(target): with TempOpAttr( "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout @@ -1387,13 +1384,13 @@ def expected(): squeeze = relay.squeeze(pool, axis=[2, 3]) dense = relay.nn.contrib_dense_pack( relay.layout_transform(squeeze, "NC8c", "NC"), - relay.layout_transform(dense_weight, "NK", "NK16n"), - "NK16n", + relay.layout_transform(dense_weight, "NC", "NC16n"), + "NC16n", out_dtype="float32", ) 
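+        # The expected weight layout NC16n is presumably the packing chosen by the
+        # x86 dense alter-op pass when targeting llvm -mcpu=core-avx2.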
return relay.Function(analysis.free_vars(dense), dense) - with tvm.target.Target("llvm"): + with tvm.target.Target("llvm -mcpu=core-avx2"): with TempOpAttr( "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout ): diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index 836d49b3441b..a34c4ac6f705 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -26,10 +26,12 @@ import tvm.testing -def run_opt_pass(expr, opt_pass): +def run_opt_pass(expr, opt_pass, params=None): assert isinstance(opt_pass, tvm.transform.Pass) mod = tvm.IRModule.from_expr(expr) + if params is not None: + mod["main"] = bind_params_by_name(mod["main"], params) mod = opt_pass(mod) entry = mod["main"] return entry if isinstance(expr, relay.Function) else entry.body @@ -148,11 +150,12 @@ def test_dynamic_to_static_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) x = relay.var("x", relay.TensorType(shape, "float32")) - k_var = relay.const(k) + k_var = relay.var("k", relay.TensorType((), "int32")) out = relay.topk(x, k_var, axis, ret_type, is_ascend, dtype) if isinstance(out, relay.expr.TupleWrapper): out = out.astuple() - func = relay.Function([x], out) + func = relay.Function([x, k_var], out) + params = {"k": k} np_data = np.random.uniform(size=shape).astype("float32") if is_ascend: @@ -172,7 +175,9 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("topk") @@ -258,14 +263,17 @@ def verify_resize(shape, scale, method, layout): size = (shape[2] * scale, shape[3] * scale) x = relay.var("x", relay.TensorType(shape, "float32")) - size_var = relay.const(np.array(size).astype("float32")) + size_var = relay.var("size", relay.TensorType((len(size),), "float32")) coord_trans = "asymmetric" if method == "nearest_neighbor" else "align_corners" z = relay.image.resize2d( x, size_var, layout, method, coordinate_transformation_mode=coord_trans ) + params = {"size": np.array(size).astype("float32")} - func = run_infer_type(relay.Function([x], z)) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + func = run_infer_type(relay.Function([x, size_var], z)) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) @@ -286,12 +294,18 @@ def test_dynamic_to_static_one_hot(): def _verify(indices_shape, depth, on_value, off_value, axis, dtype): indices = relay.var("indices", relay.TensorType(indices_shape, "int32")) depth_var = relay.const(depth) - on_value_const = relay.const(on_value) - off_value_const = relay.const(off_value) - out = relay.one_hot(indices, on_value_const, off_value_const, depth_var, axis, dtype) - func = relay.Function([indices], out) - - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + on_value_var = relay.var("on_value", relay.TensorType((), "int32")) + off_value_var = relay.var("off_value", relay.TensorType((), "int32")) + out = relay.one_hot(indices, on_value_var, off_value_var, 
depth_var, axis, dtype) + params = { + "on_value": on_value, + "off_value": off_value, + } + + func = relay.Function([indices, on_value_var, off_value_var], out) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) @@ -334,12 +348,18 @@ def verify_full(fill_value, fill_shape, dtype): def test_dynamic_to_static_upsampling(): def verify_upsampling(data_shape, scale_h_val, scale_w_val, dtype): x = relay.var("x", relay.TensorType(data_shape, dtype)) - scale_h = relay.const(scale_h_val) - scale_w = relay.const(scale_w_val) + scale_h = relay.var("scale_h", relay.TensorType((), "float32")) + scale_w = relay.var("scale_w", relay.TensorType((), "float32")) z = relay.nn.upsampling(x, scale_h, scale_w) - - func = run_infer_type(relay.Function([x], z)) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + params = { + "scale_h": scale_h_val, + "scale_w": scale_w_val, + } + + func = run_infer_type(relay.Function([x, scale_h, scale_w], z)) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) @@ -358,14 +378,21 @@ def verify_upsampling(data_shape, scale_h_val, scale_w_val, dtype): def test_dynamic_to_static_upsampling3d(): def verify_upsampling3d(data_shape, scale_d_val, scale_h_val, scale_w_val, dtype): x = relay.var("x", relay.TensorType(data_shape, dtype)) - scale_d = relay.const(scale_d_val) - scale_h = relay.const(scale_h_val) - scale_w = relay.const(scale_w_val) + scale_d = relay.var("scale_d", relay.TensorType((), "float32")) + scale_h = relay.var("scale_h", relay.TensorType((), "float32")) + scale_w = relay.var("scale_w", relay.TensorType((), "float32")) z = relay.nn.upsampling3d(x, scale_d, scale_h, scale_w) - - func = run_infer_type(relay.Function([x], z)) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + params = { + "scale_d": scale_d_val, + "scale_h": scale_h_val, + "scale_w": scale_w_val, + } + + func = run_infer_type(relay.Function([x, scale_d, scale_h, scale_w], z)) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) @@ -387,18 +414,24 @@ def verify_upsampling3d(data_shape, scale_d_val, scale_h_val, scale_w_val, dtype def test_dynamic_to_static_pad(): - def verify_pad(data_shape, pad_width, pad_val, dtype): + def verify_pad(data_shape, pad_width_val, pad_val, dtype): x = relay.var("x", relay.TensorType(data_shape, dtype)) - z = relay.nn.pad(x, relay.const(np.array(pad_width)), pad_val) - func = run_infer_type(relay.Function([x], z)) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + pad_width = relay.var( + "pad_width", relay.TensorType((len(pad_width_val), len(pad_width_val[0])), "int32") + ) + z = relay.nn.pad(x, pad_width, pad_val) + func = run_infer_type(relay.Function([x, pad_width], z)) + params = {"pad_width": np.array(pad_width_val)} + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) zz = func2.body assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("nn.pad") x_data = np.random.uniform(size=data_shape).astype(dtype) ref_res = np.pad( - x_data, pad_width, "constant", constant_values=(((pad_val,) * 2),) * len(data_shape) + x_data, pad_width_val, "constant", 
constant_values=(((pad_val,) * 2),) * len(data_shape) ) verify_func(func2, [x_data], ref_res) @@ -407,35 +440,51 @@ def verify_pad(data_shape, pad_width, pad_val, dtype): def test_dynamic_to_static_strided_slice(): - def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, dtype="int32"): + def verify( + dshape, + begin_val, + end_val, + strides_val, + output, + slice_mode="end", + test_ref=True, + dtype="int32", + ): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) - begin = begin if begin else [0] * ndim - end = end if end else list(dshape) - if strides: - if len(strides) == 1: - strides = strides * ndim + begin_val = begin_val if begin_val else [0] * ndim + end_val = end_val if end_val else list(dshape) + if strides_val: + if len(strides_val) == 1: + strides_val = strides_val * ndim else: - strides = [1] * ndim + strides_val = [1] * ndim # target numpy result x_data = np.random.uniform(size=dshape).astype("float32") - ref_res = tvm.topi.testing.strided_slice_python(x_data, begin, end, strides, slice_mode) - data = [x_data, np.array(begin), np.array(end)] + ref_res = tvm.topi.testing.strided_slice_python( + x_data, begin_val, end_val, strides_val, slice_mode + ) + data = [x_data, np.array(begin_val), np.array(end_val)] - begin = relay.const(begin, dtype=dtype) - end = relay.const(end, dtype=dtype) + begin = relay.var("begin", relay.TensorType((len(begin_val),), dtype)) + end = relay.var("end", relay.TensorType((len(end_val),), dtype)) - if strides: - data.append(np.array(strides)) - strides = relay.const(strides, dtype=dtype) + func_params = [x, begin, end] + if strides_val: + data.append(np.array(strides_val)) + strides = relay.var("strides", relay.TensorType((len(strides_val),), dtype)) z = relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode) + func_params.append(strides) else: z = relay.strided_slice(x, begin=begin, end=end, slice_mode=slice_mode) - func = relay.Function([x], z) + func = relay.Function(func_params, z) + params = {"begin": begin_val, "end": end_val, "strides": strides_val} func = run_infer_type(func) - func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + func2 = run_opt_pass( + run_opt_pass(func, transform.DynamicToStatic(), params), transform.InferType() + ) assert isinstance(func2.body, relay.Call) assert func2.body.op == relay.op.get("strided_slice") verify_func(func2, [x_data], ref_res) diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index 2bc2e4e635f0..3680310b4f92 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -34,7 +34,6 @@ def compare_fq_to_int(expr, args, allow_rounding_error=False): .evaluate()(*args) .numpy() ) - result_int = ( relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm") .evaluate()(*args) @@ -42,7 +41,7 @@ def compare_fq_to_int(expr, args, allow_rounding_error=False): ) if allow_rounding_error: - assert np.all(np.abs(result - result_int) <= 1) + assert np.all(np.abs(result.astype("int32") - result_int.astype("int32")) <= 1) else: assert np.array_equal(result, result_int) @@ -57,6 +56,7 @@ def test_fake_quantize_conv(): op = relay.op.nn.conv2d( relay.qnn.op.dequantize(x, relay.const(2.0), zero), relay.qnn.op.dequantize(w, relay.const(0.5), zero), + kernel_size=[5, 5], ) op = relay.qnn.op.quantize(op, one, zero, 
out_dtype=out_dtype) @@ -66,6 +66,29 @@ def test_fake_quantize_conv(): compare_fq_to_int(op, [x_np, w_np]) +def test_fake_quantize_conv_per_channel(): + for out_dtype in ["int8", "uint8"]: + x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") + w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8") + one = relay.const([1.0] * 16) + zero = relay.const([0] * 16) + + op = relay.op.nn.conv2d( + relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0)), + relay.qnn.op.dequantize( + w, relay.const(np.random.random([16]).astype("float32")), zero, axis=0 + ), + kernel_size=[5, 5], + channels=16, + ) + op = relay.qnn.op.quantize(op, relay.const(1.0), relay.const(0), out_dtype=out_dtype) + + x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") + w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8") + + compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True) + + def test_fake_quantize_dense(): for out_dtype in ["int8", "uint8"]: x = relay.var("x", shape=[128, 64], dtype="int8") @@ -85,6 +108,31 @@ def test_fake_quantize_dense(): compare_fq_to_int(op, [x_np, w_np]) +def test_fake_quantize_dense_per_channel(): + for out_dtype in ["int8", "uint8"]: + x = relay.var("x", shape=[128, 64], dtype="int8") + w = relay.var("w", shape=[256, 64], dtype="int8") + one = relay.const(1.0) + zero = relay.const(0) + + op = relay.op.nn.dense( + relay.qnn.op.dequantize(x, relay.const(2.0), zero), + relay.qnn.op.dequantize( + w, + relay.const(np.random.random([256]).astype("float32")), + relay.const([0] * 256), + axis=0, + ), + units=256, + ) + op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype) + + x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8") + w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8") + + compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True) + + def test_fake_quantize_batch_matmul(): for out_dtype in ["int8", "uint8"]: x = relay.var("x", shape=[1, 128, 64], dtype="int8") @@ -112,7 +160,9 @@ def test_fake_transpose_quantize_conv(): x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) x = relay.transpose(x, [0, 3, 1, 2]) - op = relay.op.nn.conv2d(x, relay.qnn.op.dequantize(w, relay.const(0.5), zero)) + op = relay.op.nn.conv2d( + x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5] + ) op = relay.qnn.op.quantize(op, one, zero) x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8") @@ -130,7 +180,9 @@ def test_fake_transpose_quantize_conv_bias_add(): x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) x = relay.transpose(x, [0, 3, 1, 2]) - op = relay.op.nn.conv2d(x, relay.qnn.op.dequantize(w, relay.const(0.5), zero)) + op = relay.op.nn.conv2d( + x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5] + ) op = relay.op.nn.bias_add(op, relay.qnn.op.dequantize(bias, one, zero)) op = relay.qnn.op.quantize(op, one, zero) @@ -141,6 +193,32 @@ def test_fake_transpose_quantize_conv_bias_add(): compare_fq_to_int(op, [x_np, w_np, bias_np]) +def test_fake_transpose_quantize_conv_bias_add_per_channel(): + x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8") + w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8") + bias = relay.var("bias", shape=[16], dtype="int32") + one = relay.const(1.0) + zero = relay.const(0) + w_scale = (np.random.random([16]).astype("float32") - 0.5) / 10 + 0.5 + w_zp = relay.const([0] * 16) + + x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + x = relay.transpose(x, [0, 3, 1, 2]) + op = relay.op.nn.conv2d( + x, 
relay.qnn.op.dequantize(w, relay.const(w_scale), w_zp, axis=0), kernel_size=[5, 5] + ) + op = relay.op.nn.bias_add( + op, relay.qnn.op.dequantize(bias, relay.const(2.0 * w_scale), w_zp, axis=0) + ) + op = relay.qnn.op.quantize(op, one, zero) + + x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8") + w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8") + bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32") + + compare_fq_to_int(op, [x_np, w_np, bias_np], allow_rounding_error=True) + + def test_fake_transpose_quantize_conv_bias_add_mismatch(): x = relay.var("x", shape=[1, 224, 224, 3], dtype="int8") w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8") @@ -151,7 +229,9 @@ def test_fake_transpose_quantize_conv_bias_add_mismatch(): x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) x = relay.transpose(x, [0, 3, 1, 2]) - op = relay.op.nn.conv2d(x, relay.qnn.op.dequantize(w, relay.const(0.5), zero)) + op = relay.op.nn.conv2d( + x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5] + ) op = relay.op.nn.bias_add(op, relay.qnn.op.dequantize(bias, two, zero)) op = relay.qnn.op.quantize(op, one, zero) @@ -318,6 +398,50 @@ def test_fake_quantize_clip(): compare_fq_to_int(op, [x_np]) +def test_fake_quantize_clip_per_channel(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8") + + x = relay.qnn.op.dequantize( + x, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), axis=1 + ) + op = relay.op.clip(x, 0, 6) + op = relay.qnn.op.quantize( + op, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), out_dtype="uint8", axis=1 + ) + + x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8") + + compare_fq_to_int(op, [x_np]) + + +def test_fake_quantize_relu(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8") + + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114)) + op = relay.op.nn.relu(x) + op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8") + + x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8") + + compare_fq_to_int(op, [x_np]) + + +def test_fake_quantize_relu_per_channel(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8") + + x = relay.qnn.op.dequantize( + x, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), axis=1 + ) + op = relay.op.nn.relu(x) + op = relay.qnn.op.quantize( + op, relay.const([1.0, 2.0, 3.0]), relay.const([96, 114, 128]), out_dtype="uint8", axis=1 + ) + + x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8") + + compare_fq_to_int(op, [x_np]) + + @pytest.mark.parametrize( "operator", [relay.op.add, relay.op.multiply, relay.op.subtract, relay.op.minimum, relay.op.maximum], @@ -377,3 +501,16 @@ def test_fake_quantize_pad(): x_np = np.random.randint(-25, 25, size=[1, 383, 128], dtype="int8") compare_fq_to_int(op, [x_np]) + + +def test_fake_quantize_depth_to_space(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") + + zero = relay.const(0) + x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + op = relay.op.nn.depth_to_space(x, 4) + op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + + x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") + + compare_fq_to_int(op, [x_np]) diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 855650f810a5..121d6562594e 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -15,12 +15,14 @@ # specific 
language governing permissions and limitations
# under the License.
import numpy as np
+import pytest

import tvm
from tvm import relay
from tvm.relay import transform
from tvm.relay.testing import run_opt_pass
import tvm.testing
+import tvm.topi.testing


def test_fuse_simple():
@@ -784,23 +786,42 @@ def test_fuse_dynamic_squeeze_slice_take():
    assert np.allclose(result.numpy(), np_result)


+@tvm.testing.uses_gpu
+def test_fuse_softmax():
+    """Test that softmax can be fused with the ops that follow it."""
+    channel_size = 16
+
+    def before():
+        x = relay.var("x", shape=(16, channel_size))
+        softmax = relay.nn.softmax(x)
+        out = relay.cast(softmax, "float16")
+        return relay.Function([x], out)
+
+    def expected():
+        p0 = relay.var("p0", shape=(16, channel_size))
+        softmax = relay.nn.softmax(p0)
+        out = relay.cast(softmax, "float16")
+
+        x = relay.var("x", shape=(16, channel_size))
+
+        f0 = relay.Function([p0], out)
+        f0 = f0.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+        y = relay.Call(f0, [x])
+        return relay.Function([x], y)
+
+    orig = before()
+    m = fuse2(tvm.IRModule.from_expr(orig))
+    after = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(m["main"], after)
+
+    inp = np.random.randn(16, channel_size).astype("float32")
+    ref = tvm.topi.testing.softmax_python(inp).astype("float16")
+
+    for tgt, dev in tvm.testing.enabled_targets():
+        ex = relay.create_executor("graph", mod=m, device=dev, target=tgt)
+        result = ex.evaluate()(inp).numpy()
+        tvm.testing.assert_allclose(result, ref, rtol=1e-4, atol=1e-4)
+
+
if __name__ == "__main__":
-    test_fuse_simple()
-    test_conv2d_fuse()
-    test_concatenate()
-    test_tuple_root()
-    test_stop_fusion()
-    test_fuse_myia_regression()
-    test_fuse_tuple_get_elemwise()
-    test_tuple_get_root()
-    test_tuple_intermediate()
-    test_tuple_consecutive()
-    test_inception_like()
-    test_fuse_parallel_injective()
-    test_immutable()
-    test_split()
-    test_fuse_max()
-    test_fuse_take()
-    test_fuse_gather_nd()
-    test_fuse_bcast_reduce_scalar()
-    test_fuse_max_diamond()
+    pytest.main([__file__])
diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
new file mode 100644
index 000000000000..d9411c92c375
--- /dev/null
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.relay import transform
+from tvm.contrib import graph_executor, pipeline_executor
+
+
+def get_manual_mod():
+    # Get a list of modules representing subgraphs.
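+    # The pipeline is split into three subgraphs: mod1 produces three outputs,
+    # mod2 consumes one of them together with a second global input, and mod3
+    # consumes outputs from both mod1 and mod2.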
+    mods = []
+    dshape = (3, 3)
+    data = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    data21 = relay.var("data_1", relay.TensorType(dshape, "float32"))
+    data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32"))
+    data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    mvalue1 = np.full((1), 1).astype("float32")
+    mvalue2 = np.full((1), 2).astype("float32")
+    mvalue3 = np.full((1), 3).astype("float32")
+    mv1 = relay.Constant(tvm.nd.array(mvalue1))
+    mv2 = relay.Constant(tvm.nd.array(mvalue2))
+    mv3 = relay.Constant(tvm.nd.array(mvalue3))
+
+    # There are three outputs in the first model.
+
+    net1_output1 = relay.add(data, mv1)
+    net1_output2 = relay.subtract(data, mv2)
+    net1_output3 = relay.multiply(data, mv3)
+
+    # The second model uses the output named net1_output1 of the first model as its
+    # first input; its second input is data21.
+    net2 = relay.add(data_net1_output_1, mv2)
+    net2 = relay.add(net2, data21)
+    net2_output = relay.add(net2, mv3)
+
+    # The third model uses the output named net2_output of the second model as its first
+    # input and the output named net1_output2 of the first model as its second input.
+    net3 = relay.multiply(data_net2_output_1, mv3)
+    net3 = relay.add(net3, data_net1_output_2)
+
+    mods.append(
+        tvm.IRModule.from_expr(
+            relay.Function([data], relay.Tuple([net1_output1, net1_output2, net1_output3]))
+        )
+    )
+    mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1, data21], net2_output)))
+    mods.append(
+        tvm.IRModule.from_expr(relay.Function([data_net1_output_2, data_net2_output_1], net3))
+    )
+
+    return mods, dshape
+
+
+def get_manual_conf(mods, target):
+    # This function is used to generate the manual pipeline configuration.
+    mod_config = {}
+    # The third output is the final (global) output; the second output feeds mod3;
+    # the first output feeds the input of mod2.
+    pipe_config1 = {
+        "mod_idx": 1,
+        "output": [
+            {"output_idx": 0, "dependent": [{"mod_idx": 2, "input_name": "data_0"}]},
+            {"output_idx": 1, "dependent": [{"mod_idx": 3, "input_name": "data_0"}]},
+            {"output_idx": 2, "dependent": [{"mod_idx": 0, "input_name": "0"}]},
+        ],
+    }
+    mod_config[mods[0]] = {
+        "pipeline": pipe_config1,
+        "target_host": None,
+        "mod_name": "default",
+        "build": None,
+        "params": None,
+        "target": target[0],
+        "dev": target[1],
+    }
+
+    pipe_config2 = {
+        "mod_idx": 2,
+        "output": [
+            {"output_idx": 0, "dependent": [{"mod_idx": 3, "input_name": "data_1"}]},
+        ],
+    }
+    mod_config[mods[1]] = {
+        "pipeline": pipe_config2,
+        "target_host": None,
+        "mod_name": "default",
+        "build": None,
+        "params": None,
+        "target": "llvm",
+        "dev": tvm.cpu(0),
+    }
+
+    pipe_config3 = {
+        "mod_idx": 3,
+        "output": [{"output_idx": 0, "dependent": [{"mod_idx": 0, "input_name": "1"}]}],
+    }
+    mod_config[mods[2]] = {
+        "pipeline": pipe_config3,
+        "target_host": None,
+        "mod_name": "default",
+        "build": None,
+        "params": None,
+        "target": "llvm",
+        "dev": tvm.cpu(0),
+    }
+    return mod_config
+
+
+def test_pipe_config_check():
+    # This test triggers runtime errors by applying invalid logical connections.
+
+    # Get the three pipeline modules here.
+    (mod1, mod2, mod3), dshape = get_manual_mod()
+
+    # An illegal input or output name is expected to raise a runtime error.
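+    # Indexing a nonexistent output (index 9) or input ("data_9") on the config
+    # should fail immediately with a RuntimeError.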
+    pipe_error = pipeline_executor.PipelineConfig()
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["output"][9]
+
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["input"]["data_9"]
+
+    # A module connection that would create a cycle in the DAG is expected to raise
+    # a runtime error.
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["output"][0].connect(pipe_error[mod2]["input"]["data_0"])
+        pipe_error[mod2]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])
+
+    # Illegal module connections are expected to raise runtime errors.
+
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["output"][0].connect(pipe_error[mod1]["input"]["data_0"])
+
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod1]["input"]["data_0"])
+
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["input"]["data_0"].connect(pipe_error[mod2]["input"]["data_0"])
+
+    with pytest.raises(RuntimeError):
+        pipe_error[mod1]["output"][0].connect(pipe_error["input"]["data_0"])
+
+    with pytest.raises(RuntimeError):
+        pipe_error["input"]["data_0"].connect(pipe_error[mod1]["output"][0])
+
+    with pytest.raises(RuntimeError):
+        pipe_error["output"]["0"].connect(pipe_error[mod1]["output"][0])
+
+
+def test_pipeline():
+    if pipeline_executor.pipeline_executor_enabled():
+        target_list = tvm.testing.enabled_targets()
+        for target in target_list:
+            # Get the three pipeline modules here.
+            (mod1, mod2, mod3), dshape = get_manual_mod()
+
+            # Prepare batch data for the pipeline computation.
+            datas = []
+            for i in range(5):
+                datas.append(np.full(dshape, 3 + i).astype("float32"))
+
+            pipe_config = pipeline_executor.PipelineConfig()
+
+            # The global input named "data_0" will be connected to an input named
+            # "data_0" of mod1.
+            pipe_config["input"]["data_0"].connect(pipe_config[mod1]["input"]["data_0"])
+
+            # The global input named "data_1" will be connected to an input named
+            # "data_1" of mod2.
+            pipe_config["input"]["data_1"].connect(pipe_config[mod2]["input"]["data_1"])
+
+            # The mod1 output[0] will be connected to an input named "data_0" of mod2.
+            pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_0"])
+
+            # The mod1 output[1] will be connected to an input named "data_0" of mod3.
+            pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_0"])
+
+            # The mod2 output[0] will be connected to an input named "data_1" of mod3.
+            pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_1"])
+
+            # The mod1 output[2] will be connected to the global output "0".
+            pipe_config[mod1]["output"][2].connect(pipe_config["output"]["0"])
+
+            # The mod3 output[0] will be connected to the global output "1".
+            pipe_config[mod3]["output"][0].connect(pipe_config["output"]["1"])
+            # Printing the configuration (print(pipe_config)) produces output like
+            # the following.
+            #
+            # Inputs
+            #   |data_0: mod1:data_0
+            #   |data_1: mod2:data_1
+            #
+            # output
+            #   |output(1) : mod1.output(2)
+            #   |output(2) : mod3.output(0)
+            #
+            # connections
+            #   |mod1.output(0)-> mod2.data_0
+            #   |mod1.output(1)-> mod3.data_0
+            #   |mod2.output(0)-> mod3.data_1
+
+            # Set other parameters.
+            pipe_config[mod1].target = target[0]
+            pipe_config[mod1].dev = target[1]
+
+            pipe_config[mod2].target = "llvm"
+            pipe_config[mod2].dev = tvm.cpu(0)
+
+            pipe_config[mod3].target = "llvm"
+            pipe_config[mod3].dev = tvm.cpu(0)
+
+            # Check the correctness of the configuration generated by the API.
+            assert pipe_config.get_config() == get_manual_conf([mod1, mod2, mod3], target)
+
+            # Build and create a pipeline module.
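+            # pipeline_executor.build compiles every subgraph for its configured
+            # target and device; opt_level=3 enables the standard optimizations.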
+ with tvm.transform.PassContext(opt_level=3): + pipeline_mod_factory = pipeline_executor.build(pipe_config) + + pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory) + assert pipeline_module + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/relay/test_target_hooks.py b/tests/python/relay/test_target_hooks.py new file mode 100644 index 000000000000..4d7a7fcdc15b --- /dev/null +++ b/tests/python/relay/test_target_hooks.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for target hooks.""" +import sys +import numpy as np +import pytest + +from tvm import relay, IRModule + +from utils.external_codegen import ( + set_external_func_attr, + check_aot_executor_result, + check_graph_executor_result, +) + + +@pytest.mark.parametrize("check_result", [check_aot_executor_result, check_graph_executor_result]) +def test_tir_external_generation(check_result): + shape = (8,) + x_data = np.random.randint(255, size=shape).astype("float32") + y_data = np.random.randint(255, size=shape).astype("float32") + inputs = {"x": x_data, "y": y_data} + + x0 = relay.var("x0", shape=shape, dtype="float32") + y0 = relay.var("y0", shape=shape, dtype="float32") + z = x0 + y0 + f = relay.Function([x0, y0], z) + f = set_external_func_attr(f, "example_target_hook", "replace_add_with_subtract") + + x = relay.var("x", shape=(8,), dtype="float32") + y = relay.var("y", shape=(8,), dtype="float32") + call = relay.Call(f, [x, y]) + func = IRModule.from_expr(call) + + check_result(func, inputs, (8,), x_data - y_data) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 4c5b98514724..725d2765477f 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -34,7 +34,7 @@ from tvm.relay.testing import mlp -def check_result(args, expected_result, mod=None): +def check_result(target, dev, args, expected_result, mod=None): """ Check that evaluating `expr` applied to the arguments produces `result` on Relay VM. @@ -47,11 +47,8 @@ def check_result(args, expected_result, mod=None): expected_result: The expected result of running the expression. 
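+    target: The target to compile the module for.
+    dev: The device on which the expression is evaluated.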
""" - for target, dev in tvm.testing.enabled_targets(): - rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()( - *args - ) - tvm.testing.assert_allclose(expected_result, rts_result.numpy()) + rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()(*args) + tvm.testing.assert_allclose(expected_result, rts_result.numpy()) def veval(f, *args, device=tvm.cpu(), target="llvm"): @@ -78,8 +75,7 @@ def vmobj_to_list(o): raise RuntimeError("Unknown object type: %s" % type(o)) -@tvm.testing.uses_gpu -def test_split(): +def test_split(target, dev): x = relay.var("x", shape=(12,)) y = relay.split(x, 3, axis=0).astuple() f = relay.Function([x], y) @@ -88,14 +84,12 @@ def test_split(): 12, ).astype("float32") ref_res = np.split(x_data, 3, axis=0) - for tgt, dev in tvm.testing.enabled_targets(): - res = veval(f, x_data, device=dev, target=tgt) - for i in range(3): - tvm.testing.assert_allclose(res[i].numpy(), ref_res[i]) + res = veval(f, x_data, device=dev, target=target) + for i in range(3): + tvm.testing.assert_allclose(res[i].numpy(), ref_res[i]) -@tvm.testing.uses_gpu -def test_split_no_fuse(): +def test_split_no_fuse(target, dev): x = relay.var("x", shape=(12,)) y = relay.split(x, 3, axis=0).astuple() z = relay.concatenate([relay.TupleGetItem(y, 0)], axis=0) @@ -104,29 +98,27 @@ def test_split_no_fuse(): x_data = np.random.rand( 12, ).astype("float32") - for tgt, dev in tvm.testing.enabled_targets(): - res = veval(f, x_data, device=dev, target=tgt) - tvm.testing.assert_allclose(res.numpy(), np.split(x_data, 3, axis=0)[0]) + res = veval(f, x_data, device=dev, target=target) + tvm.testing.assert_allclose(res.numpy(), np.split(x_data, 3, axis=0)[0]) -@tvm.testing.uses_gpu -def test_id(): + +def test_id(target, dev): x = relay.var("x", shape=(10, 10), dtype="float64") f = relay.Function([x], x) x_data = np.random.rand(10, 10).astype("float64") mod = tvm.IRModule() mod["main"] = f - check_result([x_data], x_data, mod=mod) + check_result(target, dev, [x_data], x_data, mod=mod) -@tvm.testing.uses_gpu -def test_op(): +def test_op(target, dev): x = relay.var("x", shape=(10, 10)) f = relay.Function([x], x + x) x_data = np.random.rand(10, 10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result([x_data], 2 * x_data, mod=mod) + check_result(target, dev, [x_data], 2 * x_data, mod=mod) def any(x): @@ -134,8 +126,8 @@ def any(x): return relay.op.min(x, axis=[0, 1]) -@tvm.testing.uses_gpu -def test_cond(): +@tvm.testing.known_failing_targets("vulkan") +def test_cond(target, dev): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(10, 10)) # f = relay.Function([x, y], relay.op.equal(x, y)) @@ -146,14 +138,14 @@ def test_cond(): mod = tvm.IRModule() mod["main"] = f # same - check_result([x_data, x_data], True, mod=mod) + check_result(target, dev, [x_data, x_data], True, mod=mod) # diff - check_result([x_data, y_data], False, mod=mod) + check_result(target, dev, [x_data, y_data], False, mod=mod) -@tvm.testing.uses_gpu -def test_simple_if(): +@tvm.testing.known_failing_targets("vulkan") +def test_simple_if(target, dev): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(10, 10)) f = relay.Function([x, y], relay.If(any(relay.op.equal(x, y)), x, y)) @@ -163,14 +155,14 @@ def test_simple_if(): mod = tvm.IRModule() mod["main"] = f # same - check_result([x_data, x_data], x_data, mod=mod) + check_result(target, dev, [x_data, x_data], x_data, mod=mod) # diff - check_result([x_data, y_data], y_data, mod=mod) + 
check_result(target, dev, [x_data, y_data], y_data, mod=mod) -@tvm.testing.uses_gpu -def test_multiple_ifs(): +@tvm.testing.parametrize_targets("llvm") +def test_multiple_ifs(target, dev): mod = tvm.IRModule({}) b = relay.var("b") v0 = relay.var("v0") @@ -184,14 +176,12 @@ def test_multiple_ifs(): out = relay.Let(v0, relay.Tuple([relay.const(0)]), out) fn = relay.Function([b], out) mod["main"] = fn - dev = tvm.runtime.device("llvm", 0) func = relay.create_executor(device=dev, mod=mod, kind="vm").evaluate() res = vmobj_to_list(func(False)) assert res == [1, 0] -@tvm.testing.uses_gpu -def test_unused_function(): +def test_unused_function(target, dev): cond = relay.const(True) mod = tvm.IRModule() then_name = relay.GlobalVar("times_2") @@ -212,11 +202,10 @@ def test_unused_function(): x_data = np.random.rand(2, 2).astype("float32") y_data = x_data * 2 - check_result([x_data], y_data, mod=mod) + check_result(target, dev, [x_data], y_data, mod=mod) -@tvm.testing.uses_gpu -def test_simple_call(): +def test_simple_call(target, dev): mod = tvm.IRModule({}) sum_up = relay.GlobalVar("sum_up") i = relay.var("i", shape=[], dtype="int32") @@ -227,11 +216,10 @@ def test_simple_call(): i_data = np.array(0, dtype="int32") iarg = relay.var("iarg", shape=[], dtype="int32") mod["main"] = relay.Function([iarg], sum_up(iarg)) - check_result([i_data], i_data, mod=mod) + check_result(target, dev, [i_data], i_data, mod=mod) -@tvm.testing.uses_gpu -def test_count_loop(): +def test_count_loop(target, dev): mod = tvm.IRModule({}) sum_up = relay.GlobalVar("sum_up") i = relay.var("i", shape=[], dtype="int32") @@ -247,14 +235,12 @@ def test_count_loop(): i_data = np.array(0, dtype="int32") iarg = relay.var("i", shape=[], dtype="int32") mod["main"] = relay.Function([iarg], sum_up(iarg)) - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, i_data, device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), i_data) - check_result([i_data], i_data, mod=mod) + result = veval(mod, i_data, device=dev, target=target) + tvm.testing.assert_allclose(result.numpy(), i_data) + check_result(target, dev, [i_data], i_data, mod=mod) -@tvm.testing.uses_gpu -def test_sum_loop(): +def test_sum_loop(target, dev): mod = tvm.IRModule({}) sum_up = relay.GlobalVar("sum_up") i = relay.var("i", shape=[], dtype="int32") @@ -275,11 +261,10 @@ def test_sum_loop(): iarg = relay.var("i", shape=[], dtype="int32") aarg = relay.var("accum", shape=[], dtype="int32") mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg)) - check_result([i_data, accum_data], sum(range(1, loop_bound + 1)), mod=mod) + check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod=mod) -@tvm.testing.uses_gpu -def test_tuple_fst(): +def test_tuple_fst(target, dev): ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))]) tup = relay.var("tup", type_annotation=ttype) f = relay.Function([tup], relay.TupleGetItem(tup, 0)) @@ -287,11 +272,10 @@ def test_tuple_fst(): j_data = np.random.rand(10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result([(i_data, j_data)], i_data, mod=mod) + check_result(target, dev, [(i_data, j_data)], i_data, mod=mod) -@tvm.testing.uses_gpu -def test_tuple_second(): +def test_tuple_second(target, dev): ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))]) tup = relay.var("tup", type_annotation=ttype) f = relay.Function([tup], relay.TupleGetItem(tup, 1)) @@ -299,11 +283,10 @@ def test_tuple_second(): j_data = 
np.random.rand(10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result([(i_data, j_data)], j_data, mod=mod) + check_result(target, dev, [(i_data, j_data)], j_data, mod=mod) -@tvm.testing.uses_gpu -def test_list_constructor(): +def test_list_constructor(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -316,17 +299,15 @@ def test_list_constructor(): mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - assert len(result) == 2 - assert len(result[1]) == 2 + result = veval(mod, device=dev, target=target) + assert len(result) == 2 + assert len(result[1]) == 2 - obj = vmobj_to_list(result) - tvm.testing.assert_allclose(obj, np.array([3, 2, 1])) + obj = vmobj_to_list(result) + tvm.testing.assert_allclose(obj, np.array([3, 2, 1])) -@tvm.testing.uses_gpu -def test_let_tensor(): +def test_let_tensor(target, dev): sb = relay.ScopeBuilder() shape = (1,) x = relay.var("x", shape=shape, dtype="float32") @@ -342,11 +323,10 @@ def test_let_tensor(): x_data = np.random.rand(*shape).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result([x_data], x_data + 42.0, mod=mod) + check_result(target, dev, [x_data], x_data + 42.0, mod=mod) -@tvm.testing.uses_gpu -def test_let_scalar(): +def test_let_scalar(target, dev): sb = relay.ScopeBuilder() x = relay.var("x", "float32") @@ -360,11 +340,10 @@ def test_let_scalar(): x_data = np.array(np.random.rand()).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result([x_data], x_data + 42.0, mod=mod) + check_result(target, dev, [x_data], x_data + 42.0, mod=mod) -@tvm.testing.uses_gpu -def test_compose(): +def test_compose(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -394,13 +373,11 @@ def test_compose(): mod["main"] = f x_data = np.array(np.random.rand()).astype("float32") - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, [x_data], device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), x_data + 2.0) + result = veval(mod, [x_data], device=dev, target=target) + tvm.testing.assert_allclose(result.numpy(), x_data + 2.0) -@tvm.testing.uses_gpu -def test_list_hd(): +def test_list_hd(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -415,13 +392,11 @@ def test_list_hd(): mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), 3) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(result.numpy(), 3) -@pytest.mark.xfail -def test_list_tl_empty_list(): +def test_list_tl_empty_list(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -432,12 +407,11 @@ def test_list_tl_empty_list(): mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) + with pytest.raises(tvm.error.TVMError): + result = veval(mod, device=dev, target=target) -@tvm.testing.uses_gpu -def test_list_tl(): +def test_list_tl(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -452,13 +426,11 @@ def test_list_tl(): mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array([2, 1])) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array([2, 1])) -@tvm.testing.uses_gpu -def test_list_nth(): +def test_list_nth(target, dev): expected = list(range(10)) for i in range(len(expected)): @@ -474,13 +446,11 @@ def 
test_list_nth(): f = relay.Function([], nth(l, relay.const(i))) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), expected[i]) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(result.numpy(), expected[i]) -@tvm.testing.uses_gpu -def test_list_update(): +def test_list_update(target, dev): expected = list(range(10)) mod = tvm.IRModule() @@ -500,13 +470,11 @@ def test_list_update(): f = relay.Function([], l) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array(expected)) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array(expected)) -@tvm.testing.uses_gpu -def test_list_length(): +def test_list_length(target, dev): expected = list(range(10)) mod = tvm.IRModule() @@ -524,13 +492,11 @@ def test_list_length(): f = relay.Function([], l) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), 10) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(result.numpy(), 10) -@tvm.testing.uses_gpu -def test_list_map(): +def test_list_map(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -544,13 +510,11 @@ def test_list_map(): f = relay.Function([], map(add_one_func, l)) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 2])) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 2])) -@tvm.testing.uses_gpu -def test_list_foldl(): +def test_list_foldl(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -564,13 +528,11 @@ def test_list_foldl(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldl(rev_dup_func, nil(), l)) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 3, 2, 2, 1, 1])) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 3, 2, 2, 1, 1])) -@tvm.testing.uses_gpu -def test_list_foldr(): +def test_list_foldr(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -584,13 +546,11 @@ def test_list_foldr(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldr(identity_func, nil(), l)) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array([1, 2, 3])) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array([1, 2, 3])) -@tvm.testing.uses_gpu -def test_list_sum(): +def test_list_sum(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -600,13 +560,11 @@ def test_list_sum(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], sum(l)) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(result.numpy(), 6) + result = veval(mod, device=dev, target=target) + 
tvm.testing.assert_allclose(result.numpy(), 6) -@tvm.testing.uses_gpu -def test_list_filter(): +def test_list_filter(target, dev): mod = tvm.IRModule() p = Prelude(mod) @@ -623,26 +581,22 @@ def test_list_filter(): ) f = relay.Function([], filter(greater_than_one, l)) mod["main"] = f - for tgt, dev in tvm.testing.enabled_targets(): - result = veval(mod, device=dev, target=tgt) - tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 5])) + result = veval(mod, device=dev, target=target) + tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 5])) -@tvm.testing.uses_gpu -def test_closure(): +def test_closure(target, dev): x = relay.var("x", shape=()) y = relay.var("y", shape=()) f = relay.Function([x], x + y) ff = relay.Function([y], f) clo = ff(relay.const(1.0)) main = clo(relay.const(2.0)) - for tgt, dev in tvm.testing.enabled_targets(): - res = veval(main, device=dev, target=tgt) - tvm.testing.assert_allclose(res.numpy(), 3.0) + res = veval(main, device=dev, target=target) + tvm.testing.assert_allclose(res.numpy(), 3.0) -@tvm.testing.uses_gpu -def test_add_op_scalar(): +def test_add_op_scalar(target, dev): """ test_add_op_scalar: fn (x, y) { @@ -660,11 +614,10 @@ def test_add_op_scalar(): ] for (x_data, y_data) in x_y_data: mod["main"] = func - check_result([x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) -@tvm.testing.uses_gpu -def test_add_op_scalar_int(): +def test_add_op_scalar_int(target, dev): """ test_add_op_scalar_int: fn (x, y) { @@ -682,11 +635,10 @@ def test_add_op_scalar_int(): ] for (x_data, y_data) in x_y_data: mod["main"] = func - check_result([x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) -@tvm.testing.uses_gpu -def test_add_op_tensor(): +def test_add_op_tensor(target, dev): """ test_add_op_tensor: fn (x, y) { @@ -700,11 +652,10 @@ def test_add_op_tensor(): x_data = np.random.rand(10, 5).astype("float32") y_data = np.random.rand(10, 5).astype("float32") mod["main"] = func - check_result([x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) -@tvm.testing.uses_gpu -def test_add_op_broadcast(): +def test_add_op_broadcast(target, dev): """ test_add_op_broadcast: fn (x, y) { @@ -718,7 +669,7 @@ def test_add_op_broadcast(): x_data = np.random.rand(10, 5).astype("float32") y_data = np.random.rand(1, 5).astype("float32") mod["main"] = func - check_result([x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) def test_vm_optimize_dynamic(): @@ -742,8 +693,7 @@ def test_vm_optimize(): assert len(free_vars) == 1 -@tvm.testing.uses_gpu -def test_loop_free_var(): +def test_loop_free_var(target, dev): x = relay.var("x", shape=(), dtype="int32") i = relay.var("i", shape=(), dtype="int32") s = relay.var("s", shape=(), dtype="int32") @@ -765,11 +715,10 @@ def body_with_free_var(i, acc): ret = relay.TupleGetItem(tup, 1) mod = tvm.IRModule() mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret) - check_result(args, expected, mod=mod) + check_result(target, dev, args, expected, mod=mod) -@tvm.testing.uses_gpu -def test_vm_reshape_tensor(): +def test_vm_reshape_tensor(target, dev): x_np = np.random.uniform(size=(8, 16)).astype("float32") x = relay.var("x", shape=(8, 16), dtype="float32") y = relay.reshape(x, [-1, 4, 8]) @@ -778,7 +727,7 @@ def test_vm_reshape_tensor(): with 
tvm.transform.PassContext(opt_level=3): exec = relay.vm.compile(mod, "llvm") assert "reshape_tensor" in exec.bytecode - check_result([x_np], x_np.reshape([4, 4, 8]), mod) + check_result(target, dev, [x_np], x_np.reshape([4, 4, 8]), mod) x = relay.var("x", shape=(8, 16), dtype="float32") y = relay.reshape(x, [16, -1]) @@ -788,7 +737,7 @@ def test_vm_reshape_tensor(): with tvm.transform.PassContext(opt_level=3): exec = relay.vm.compile(mod, "llvm") assert exec.bytecode.count("reshape_tensor") == 1 - check_result([x_np], x_np.reshape([4, 4, 8]), mod) + check_result(target, dev, [x_np], x_np.reshape([4, 4, 8]), mod) # reshape with symbolic/any shape for n in [tvm.tir.Any(), tvm.te.size_var("n")]: @@ -800,7 +749,7 @@ def test_vm_reshape_tensor(): with tvm.transform.PassContext(opt_level=3): exec = relay.vm.compile(mod, "llvm") assert exec.bytecode.count("reshape_tensor") == 1 - check_result([x_np], x_np.reshape([32, 2, 2]), mod) + check_result(target, dev, [x_np], x_np.reshape([32, 2, 2]), mod) # dyn.reshape x = relay.var("x", shape=(8, 16), dtype="float32") @@ -814,10 +763,10 @@ def test_vm_reshape_tensor(): assert exec.bytecode.count("reshape_tensor") == 2 assert "reshape_tensor" in exec.bytecode y_np = np.array([8, 2, 8]).astype("int32") - check_result([x_np, y_np], x_np.reshape([8, 2, 8]), mod) + check_result(target, dev, [x_np, y_np], x_np.reshape([8, 2, 8]), mod) -def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): +def test_vm_reshape_tuple(target, dev, x_shape=(1, 4, 2), y_shape=(1, 2, 10)): tup = relay.var( "tup", type_annotation=relay.TupleType([relay.TensorType(x_shape), relay.TensorType(y_shape)]), @@ -828,9 +777,8 @@ def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): x_data = np.random.uniform(size=x_shape).astype("float32") y_data = np.random.uniform(size=y_shape).astype("float32") - for tgt, dev in tvm.testing.enabled_targets(): - res = veval(f, (x_data, y_data), device=dev, target=tgt) - tvm.testing.assert_allclose(res.numpy(), np.reshape(x_data, (1, -1))) + res = veval(f, (x_data, y_data), device=dev, target=target) + tvm.testing.assert_allclose(res.numpy(), np.reshape(x_data, (1, -1))) def test_constant_shape_with_external_codegen(): @@ -921,9 +869,8 @@ def test_get_output_single(): np.testing.assert_allclose(outputs[0].numpy(), inp + inp) -def test_get_output_multiple(): - target = tvm.target.Target("llvm") - +@tvm.testing.parametrize_targets("llvm") +def test_get_output_multiple(target, dev): # Build an IRModule. x = relay.var("x", shape=(10,)) f = relay.Function([x], relay.Tuple([x + x, x])) @@ -931,7 +878,7 @@ def test_get_output_multiple(): # Compile to VMExecutable. vm_exec = vm.compile(mod, target=target) - vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu()) + vm_factory = runtime.vm.VirtualMachine(vm_exec, dev) inp = np.ones(10, dtype="float32") vm_factory.invoke_stateful("main", inp) outputs = vm_factory.get_outputs() @@ -940,9 +887,8 @@ def test_get_output_multiple(): np.testing.assert_allclose(outputs[1].numpy(), inp) -def test_get_input_index(): - target = tvm.target.Target("llvm") - +@tvm.testing.parametrize_targets("llvm") +def test_get_input_index(target, dev): # Build an IRModule. data_0, data_1 = ["d1", "d2"] x, y = [relay.var(c, shape=(10,)) for c in [data_0, data_1]] @@ -951,16 +897,16 @@ def test_get_input_index(): # Compile to VMExecutable. 
vm_exec = vm.compile(mod, target=target) - vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu()) + vm_factory = runtime.vm.VirtualMachine(vm_exec, dev) assert vm_factory.get_input_index(data_1) == 1 assert vm_factory.get_input_index(data_0) == 0 assert vm_factory.get_input_index("invalid") == -1 -@tvm.testing.requires_llvm -def test_benchmark(): +@tvm.testing.parametrize_targets("llvm") +def test_benchmark(target, dev): mod, params = mlp.get_workload(1) - lib = vm.compile(mod, target="llvm", params=params) + lib = vm.compile(mod, target=target, params=params) exe = runtime.vm.VirtualMachine(lib, tvm.cpu()) data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32")) result = exe.benchmark(tvm.cpu(), data, func_name="main", repeat=2, number=1) @@ -973,7 +919,7 @@ def test_benchmark(): "time_evaluator", return_value=lambda x: tvm.runtime.module.BenchmarkResult([1, 2, 2, 5]), ) as method: - result = exe.benchmark(tvm.cpu(), data, func_name="main", repeat=2, number=1) + result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1) assert result.mean == 2.5 assert result.median == 2.0 assert result.max == 5 @@ -981,8 +927,7 @@ def test_benchmark(): assert result.std == 1.5 -@tvm.testing.parametrize_targets("cuda", "llvm") -def test_benchmark_end_to_end(dev, target): +def test_benchmark_end_to_end(target, dev): mod, params = mlp.get_workload(1) lib = vm.compile(mod, target=target, params=params) exe = runtime.vm.VirtualMachine(lib, dev) @@ -1014,4 +959,4 @@ def test_benchmark_end_to_end_rpc(): if __name__ == "__main__": - pytest.main([__file__]) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py new file mode 100644 index 000000000000..85583f6ccc5d --- /dev/null +++ b/tests/python/relay/utils/external_codegen.py @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Utilities for testing external code generation""" + +import os +import sys + +import pytest + +import tvm +from tvm import relay, runtime +from tvm.contrib import utils + + +skip_windows = pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") +skip_micro = pytest.mark.skipif( + tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON", + reason="MicroTVM support not enabled. 
Set USE_MICRO=ON in config.cmake to enable.", +) + + +def parametrize_external_codegen_checks(test): + """Parametrize over the various check_result functions which are available""" + return pytest.mark.parametrize( + "check_result", + [ + pytest.param(check_aot_executor_result, marks=[skip_windows, skip_micro]), + pytest.param(check_graph_executor_result, marks=[skip_windows]), + pytest.param(check_vm_result, marks=[skip_windows]), + ], + )(test) + + +def parametrize_external_json_codegen_checks(test): + """Parametrize over the various check_result functions which are available for JSON""" + return pytest.mark.parametrize( + "check_result", + [ + pytest.param(check_graph_executor_result, marks=[skip_windows]), + pytest.param(check_vm_result, marks=[skip_windows]), + ], + )(test) + + +def update_lib(lib): + test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + source_dir = os.path.join(test_dir, "..", "..", "..") + contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") + + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] + tmp_path = utils.tempdir() + lib_name = "lib.so" + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.runtime.load_module(lib_path) + + return lib + + +def check_vm_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + exe = relay.vm.compile(mod, target=target) + code, lib = exe.save() + lib = update_lib(lib) + exe = runtime.vm.Executable.load_exec(code, lib) + vm = runtime.vm.VirtualMachine(exe, device) + out = vm.run(**map_inputs) + tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol) + + +def check_graph_executor_result( + mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu() +): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + executor_factory = relay.build(mod, target=target) + lib = update_lib(executor_factory.lib) + rt_mod = tvm.contrib.graph_executor.create(executor_factory.graph_json, lib, device) + + for name, data in map_inputs.items(): + rt_mod.set_input(name, data) + rt_mod.run() + out = tvm.nd.empty(out_shape, device=device) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose(out.numpy(), result, rtol=tol, atol=tol) + + +def check_aot_executor_result( + mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu() +): + # Late import to avoid breaking test with USE_MICRO=OFF. + from aot.aot_test_utils import AOTTestModel, AOT_DEFAULT_RUNNER, compile_and_run + + interface_api = "packed" + use_unpacked_api = False + test_runner = AOT_DEFAULT_RUNNER + compile_and_run( + AOTTestModel(module=mod, inputs=map_inputs, outputs=[result]), + test_runner, + interface_api, + use_unpacked_api, + ) + + +def set_external_func_attr(func, compiler, ext_symbol): + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Compiler", compiler) + func = func.with_attr("global_symbol", ext_symbol) + return func diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 641eed51d5cf..6ca2a2a5fcb0 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -14,6 +14,7 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +import pytest import tvm from tvm import te @@ -931,20 +932,27 @@ def test_shift_left_simplify(): ck.verify(z, tvm.tir.const(1 << 10, "int32")) +def test_div_zero_simplify(): + ck = RewriteChecker() + ramp = tvm.tir.Ramp(1, 1, 2) + broadcast = tvm.tir.Broadcast(0, 2) + + with pytest.raises(tvm.error.TVMError) as cm: + ck.analyzer.rewrite_simplify(tvm.tir.Div(ramp, broadcast)) + assert "division by zero" in str(cm.value) + + with pytest.raises(tvm.error.TVMError) as cm: + ck.analyzer.rewrite_simplify(tvm.tir.Mod(ramp, broadcast)) + assert "division by zero" in str(cm.value) + + with pytest.raises(tvm.error.TVMError) as cm: + ck.analyzer.rewrite_simplify(tvm.tir.FloorDiv(ramp, broadcast)) + assert "division by zero" in str(cm.value) + + with pytest.raises(tvm.error.TVMError) as cm: + ck.analyzer.rewrite_simplify(tvm.tir.FloorMod(ramp, broadcast)) + assert "division by zero" in str(cm.value) + + if __name__ == "__main__": - test_floordiv_index_simplify() - test_floormod_index_simplify() - test_cmp_simplify() - test_vector_simplify() - test_add_index_simplify() - test_sub_index_simplify() - test_mul_index_simplify() - test_div_index_simplify() - test_max_index_simplify() - test_min_index_simplify() - test_mod_index_simplify() - test_select_simplify() - test_logical_simplify() - test_let_simplify() - test_cast_simplify() - test_shift_left_simplify() + pytest.main([__file__]) diff --git a/tests/python/unittest/test_autotvm_database.py b/tests/python/unittest/test_autotvm_database.py index 197243ed47c0..d5980022811f 100644 --- a/tests/python/unittest/test_autotvm_database.py +++ b/tests/python/unittest/test_autotvm_database.py @@ -21,7 +21,7 @@ from tvm.autotvm import database from tvm.autotvm.record import encode, MeasureResult -from test_autotvm_common import get_sample_records +from tvm.testing.autotvm import get_sample_records def test_save_load(): diff --git a/tests/python/unittest/test_autotvm_executor.py b/tests/python/unittest/test_autotvm_executor.py deleted file mode 100644 index 9757576be9e3..000000000000 --- a/tests/python/unittest/test_autotvm_executor.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Test local executor""" -import time - -from tvm.autotvm.measure import LocalExecutor, executor - - -def slow(n): - r = 0 - for i in range(0, n + 1): - r += i - return r - - -def fast(n): - return n * (n + 1) // 2 - - -def test_local_measure_async(): - ex = LocalExecutor() - f1 = ex.submit(slow, 9999999) - f2 = ex.submit(fast, 9999999) - t1 = 0 - t2 = 0 - while True: - if t1 == 0 and f1.done(): - t1 = time.time() - if t2 == 0 and f2.done(): - t2 = time.time() - if t1 != 0 and t2 != 0: - break - assert t2 < t1, "Expected fast async job to finish first!" - assert f1.get() == f2.get() - - -def timeout_job(n): - time.sleep(n * 1.5) - - -def test_timeout(): - timeout = 0.5 - - ex = LocalExecutor(timeout=timeout) - - f1 = ex.submit(timeout_job, timeout) - while not f1.done(): - pass - res = f1.get() - assert isinstance(res, executor.TimeoutError) - - -if __name__ == "__main__": - test_local_measure_async() - test_timeout() diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py index c433d8fb7297..be89ee2506fc 100644 --- a/tests/python/unittest/test_autotvm_index_tuner.py +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -17,7 +17,7 @@ """Test index based tuners""" import multiprocessing -from test_autotvm_common import DummyRunner, get_sample_task +from tvm.testing.autotvm import DummyRunner, get_sample_task from tvm import autotvm from tvm.autotvm.tuner import GridSearchTuner, RandomTuner diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index a89c69c37d64..3ef5cbdad635 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -17,13 +17,14 @@ """Test builder and runner""" import logging import multiprocessing -import time +import concurrent import numpy as np import tvm from tvm import te -from test_autotvm_common import DummyRunner, bad_matmul, get_sample_task +from tvm.autotvm.measure import executor +from tvm.testing.autotvm import DummyRunner, bad_matmul, get_sample_task from tvm import autotvm from tvm.autotvm.measure.measure import MeasureErrorNo, MeasureResult from tvm.autotvm import measure @@ -76,7 +77,9 @@ def submit(self, func, *args, **kwargs): self.ran_dummy_executor = True sig = Signature.from_callable(func) assert sig.bind(*args, **kwargs).arguments["ref_input"] == refinp - return measure.local_executor.LocalFutureNoFork(None) + dummy_future = concurrent.futures.Future() + dummy_future.set_result(None) + return dummy_future runner.executor = DummyExecutor() runner.run([None], [None]) diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py index 51cc9074a4fe..65739df52cd9 100644 --- a/tests/python/unittest/test_autotvm_record.py +++ b/tests/python/unittest/test_autotvm_record.py @@ -25,7 +25,7 @@ from tvm.autotvm.measure import MeasureInput, MeasureResult, MeasureErrorNo from tvm.autotvm.record import encode, decode, ApplyHistoryBest, measure_str_key -from test_autotvm_common import get_sample_task +from tvm.testing.autotvm import get_sample_task def test_load_dump(): diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py index 445cff8759ab..baecdaceab6d 100644 --- a/tests/python/unittest/test_autotvm_xgboost_model.py +++ b/tests/python/unittest/test_autotvm_xgboost_model.py @@ -25,7 +25,7 @@ from tvm.autotvm import MeasureInput, MeasureResult from tvm.autotvm.tuner.xgboost_cost_model 
import XGBoostCostModel -from test_autotvm_common import get_sample_task, get_sample_records +from tvm.testing.autotvm import get_sample_task, get_sample_records def test_fit(): diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 586e9fbfb91e..af14a38c9f9a 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -219,5 +219,110 @@ def test_platform_timer(): assert len(result.results) == 3 +@tvm.testing.requires_micro +def test_autotune(): + """Verify that autotune works with micro.""" + import tvm.relay as relay + + data = relay.var("data", relay.TensorType((1, 3, 64, 64), "float32")) + weight = relay.var("weight", relay.TensorType((8, 3, 5, 5), "float32")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + kernel_layout="OIHW", + out_dtype="float32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + + main_func = mod["main"] + shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params} + type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params} + + weight_data = np.ones(shape_dict["weight"]).astype(type_dict["weight"]) + input_data = np.ones(shape_dict["data"]).astype(type_dict["data"]) + params = {"weight": weight_data} + inputs = {"data": input_data} + + target = tvm.target.target.micro("host") + template_project_dir = pathlib.Path(tvm.micro.get_standalone_crt_dir()) / "template" / "host" + + pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}) + with pass_context: + tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target) + assert len(tasks) > 0 + + module_loader = tvm.micro.AutoTvmModuleLoader( + template_project_dir=template_project_dir, + project_options={}, + ) + builder = tvm.autotvm.LocalBuilder( + n_parallel=1, + build_kwargs={"build_option": {"tir.disable_vectorize": True}}, + do_fork=True, + build_func=tvm.micro.autotvm_build_func, + ) + runner = tvm.autotvm.LocalRunner(number=1, repeat=1, module_loader=module_loader) + + measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner) + + tune_log_file = pathlib.Path("crt_autotune.log") + if tune_log_file.exists(): + tune_log_file.unlink() + + num_trials = 10 + for task in tasks: + tuner = tvm.autotvm.tuner.GATuner(task) + tuner.tune( + n_trial=num_trials, + measure_option=measure_option, + callbacks=[ + tvm.autotvm.callback.log_to_file(str(tune_log_file)), + tvm.autotvm.callback.progress_bar(num_trials, si_prefix="M"), + ], + si_prefix="M", + ) + + assert tuner.best_flops > 0 + + # Build without tuning + with pass_context: + lowered = tvm.relay.build(mod, target=target, params=params) + + temp_dir = tvm.contrib.utils.tempdir() + project = tvm.micro.generate_project(template_project_dir, lowered, temp_dir / "project") + project.build() + with tvm.micro.Session(project.transport()) as session: + graph_mod = tvm.micro.create_local_graph_executor( + lowered.get_graph_json(), session.get_system_lib(), session.device + ) + graph_mod.set_input(**lowered.get_params()) + graph_mod.run(**inputs) + expected_output = graph_mod.get_output(0).numpy() + del graph_mod + + # Build using autotune logs + with tvm.autotvm.apply_history_best(str(tune_log_file)): + with pass_context: + lowered_tuned = tvm.relay.build(mod, target=target, params=params) + + temp_dir = tvm.contrib.utils.tempdir() + project = tvm.micro.generate_project(template_project_dir, lowered_tuned, 
temp_dir / "project") + project.build() + with tvm.micro.Session(project.transport()) as session: + graph_mod = tvm.micro.create_local_graph_executor( + lowered_tuned.get_graph_json(), session.get_system_lib(), session.device + ) + graph_mod.set_input(**lowered_tuned.get_params()) + graph_mod.run(**inputs) + output = graph_mod.get_output(0).numpy() + del graph_mod + + tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5) + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_ir_container.py b/tests/python/unittest/test_ir_container.py index fb83817f1eed..3652d5bdb280 100644 --- a/tests/python/unittest/test_ir_container.py +++ b/tests/python/unittest/test_ir_container.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import pytest import tvm from tvm import te import numpy as np @@ -34,6 +35,17 @@ def test_array_save_load_json(): assert a_loaded[1].value == 2 +def test_dir_array(): + a = tvm.runtime.convert([1, 2, 3]) + assert dir(a) + + +def test_getattr_array(): + a = tvm.runtime.convert([1, 2, 3]) + assert getattr(a, "type_key") == "Array" + assert not hasattr(a, "test_key") + + def test_map(): a = te.var("a") b = te.var("b") @@ -70,6 +82,21 @@ def test_map_save_load_json(): assert dd == {"a": 2, "b": 3} +def test_dir_map(): + a = te.var("a") + b = te.var("b") + amap = tvm.runtime.convert({a: 2, b: 3}) + assert dir(amap) + + +def test_getattr_map(): + a = te.var("a") + b = te.var("b") + amap = tvm.runtime.convert({a: 2, b: 3}) + assert getattr(amap, "type_key") == "Map" + assert not hasattr(amap, "test_key") + + def test_in_container(): arr = tvm.runtime.convert(["a", "b", "c"]) assert "a" in arr @@ -86,10 +113,4 @@ def test_ndarray_container(): if __name__ == "__main__": - test_str_map() - test_array() - test_map() - test_array_save_load_json() - test_map_save_load_json() - test_in_container() - test_ndarray_container() + pytest.main([__file__]) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 2abbcef29283..162e10280d13 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -100,7 +100,7 @@ def test_cpu_get_graph_json(): loaded_lib = tvm.runtime.load_module(path_lib) json = loaded_lib["get_graph_json"]() assert isinstance(json, str) == True - assert json.find("tvmgen_default_fused_nn_softmax1") > -1 + assert json.find("tvmgen_default_fused_nn_softmax_add") > -1 @tvm.testing.requires_cuda diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index 523065465172..7bf4d72b047e 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -88,7 +88,12 @@ def save_object(names): with open(path_runtime_py, "w") as fo: fo.write(runtime_py) - subprocess.check_call("python3 %s %s %s" % (path_runtime_py, path_dso, dtype), shell=True) + proc = subprocess.run( + [sys.executable, path_runtime_py, path_dso, dtype], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + assert proc.returncode == 0, f"{proc.args} exited with {proc.returncode}: {proc.stdout}" @tvm.testing.requires_gpu diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 
8306f2f67fa1..ca6cb0181489 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -26,6 +26,9 @@ from tvm import relay from tvm.relay.testing import mlp from tvm.contrib.debugger import debug_executor +from tvm import rpc +from tvm.contrib import utils +from tvm.runtime.profiling import Report def read_csv(report): @@ -102,7 +105,6 @@ def test_papi(target, dev): func_name="main", collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})], ) - print(report) assert metric in str(report) csv = read_csv(report) @@ -126,10 +128,60 @@ def test_json(): assert "microseconds" in parsed["calls"][0]["Duration (us)"] assert len(parsed["calls"]) > 0 for call in parsed["calls"]: - assert isinstance(call["Name"], str) + assert isinstance(call["Name"]["string"], str) assert isinstance(call["Count"]["count"], int) assert isinstance(call["Duration (us)"]["microseconds"], float) +@tvm.testing.requires_llvm +def test_rpc_vm(): + server = rpc.Server(key="profiling") + remote = rpc.connect("127.0.0.1", server.port, key="profiling") + + mod, params = mlp.get_workload(1) + exe = relay.vm.compile(mod, "llvm", params=params) + temp = utils.tempdir() + path = temp.relpath("lib.tar") + exe.mod.export_library(path) + remote.upload(path) + rexec = remote.load_module("lib.tar") + vm = profiler_vm.VirtualMachineProfiler(rexec, remote.cpu()) + report = vm.profile(tvm.nd.array(np.ones((1, 1, 28, 28), dtype="float32"), device=remote.cpu())) + assert len(report.calls) > 0 + + +def test_rpc_graph(): + server = rpc.Server(key="profiling") + remote = rpc.connect("127.0.0.1", server.port, key="profiling") + + mod, params = mlp.get_workload(1) + exe = relay.build(mod, "llvm", params=params) + temp = utils.tempdir() + path = temp.relpath("lib.tar") + exe.export_library(path) + remote.upload(path) + rexec = remote.load_module("lib.tar") + + gr = debug_executor.create(exe.get_graph_json(), rexec, remote.cpu()) + + data = np.random.rand(1, 1, 28, 28).astype("float32") + report = gr.profile(data=data) + assert len(report.calls) > 0 + + +def test_report_serialization(): + mod, params = mlp.get_workload(1) + + exe = relay.vm.compile(mod, "llvm", params=params) + vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu()) + + data = np.random.rand(1, 1, 28, 28).astype("float32") + report = vm.profile(data, func_name="main") + + report2 = Report.from_json(report.json()) + # equality on reports compares pointers, so we compare the printed results instead. + assert str(report) == str(report2) + + if __name__ == "__main__": test_papi("llvm", tvm.cpu()) diff --git a/tests/python/unittest/test_target_codegen_hexagon.py b/tests/python/unittest/test_target_codegen_hexagon.py index 6ffb2f4741e8..c8b48993967b 100644 --- a/tests/python/unittest/test_target_codegen_hexagon.py +++ b/tests/python/unittest/test_target_codegen_hexagon.py @@ -15,9 +15,11 @@ # specific language governing permissions and limitations # under the License. 
+import numpy as np import os import re import tvm +import tvm.relay import tvm.contrib.hexagon as hexagon @@ -107,7 +109,84 @@ def test_alloc_vtcm(): assert "HexagonBackendFreeVTCM" in calls +def test_llvm_options(): + if not check_prereq_and_setup(): + return + target = tvm.target.hexagon("v66", llvm_options="-hexagon-noopt") + Zero = tvm.te.compute((10,), lambda _: tvm.tir.const(0, "int32")) + s = tvm.te.create_schedule(Zero.op) + tvm.build(s, [Zero], target=target, name="zero") + # Check that BuildHexagon hasn't crashed because of target attribute + # type mismatch. + assert re.search("-hexagon-noopt", str(target)) + + +def test_linked_params_codegen(): + if not check_prereq_and_setup(): + return + + # A simple model (a single conv2d) to trigger parameter separation: + mod_lines = [ + '#[version = "0.0.5"]', + "def @main(%input: Tensor[(1, 16, 16, 3), uint8], %weights: Tensor[(3, 3, 3, 3), uint8])" + " -> Tensor[(1, 14, 14, 3), uint8] {", + ' nn.conv2d(%input, %weights, data_layout="NHWC", kernel_layout="HWIO", ' + 'kernel_size=[3, 3], out_dtype="uint8")', + "}", + ] + mod = tvm.parser.fromtext("\n".join(mod_lines)) + # Make the params be 81 x 'T': + params = {"weights": np.full([3, 3, 3, 3], fill_value=ord("T"), dtype=np.uint8)} + + target = tvm.target.hexagon("v68", link_params=True) + + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, target=target, target_host=target, params=params) + llvm_ir = lib.get_lib().get_source("ll") + + # The definition of the parameter: + p0_def_re = r"@__tvm_param__p0 = internal constant \[81 x i8\] c\"T{81}\", align 128" + assert re.search(p0_def_re, llvm_ir) + + # The body of the _lookup_linked_param function: + linked_param_re = r"(define.*@_lookup_linked_param\(.*\).* {[^}]*})" + linked_param_body = re.search(linked_param_re, llvm_ir, flags=re.MULTILINE) + assert linked_param_body and linked_param_body.groups() + + # Reference to the parameter: + p0_use_re = r"\[81 x i8\]\* @__tvm_param__p0" + assert re.search(p0_use_re, linked_param_body.groups()[0]) + + """ + A snippet of actual LLVM IR containing the definition of the linked + parameter, and the body of the _lookup_linked_param function. 
+ + + @__tvm_param__p0 = internal constant [81 x i8] c"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", align 128 + + define dllexport i32 @_lookup_linked_param(i8* nocapture readonly %0, i32* nocapture readnone %1, i32 %2, i8* nocapture %3, i32* nocapture %4, i8* nocapture readnone %5) local_unnamed_addr #2 { + entry: + %6 = bitcast i8* %0 to i64* + %7 = load i64, i64* %6, align 8 + %cond = icmp eq i64 %7, 1 + br i1 %cond, label %case___tvm_param__p0, label %common.ret + + common.ret: ; preds = %entry, %case___tvm_param__p0 + %storemerge = phi i32 [ 3, %case___tvm_param__p0 ], [ 4, %entry ] + store i32 %storemerge, i32* %4, align 4 + ret i32 0 + + case___tvm_param__p0: ; preds = %entry + %8 = bitcast i8* %3 to i8** + store i8* getelementptr inbounds ([81 x i8], [81 x i8]* @__tvm_param__p0, i32 0, i32 0), i8** %8, align 4 + br label %common.ret + } + """ + + if __name__ == "__main__": test_basic() test_llvm_target_features() test_alloc_vtcm() + test_llvm_options() + test_linked_params_codegen() diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 10cbcd68f362..e5e93ed2c940 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -818,5 +818,32 @@ def do_atomic_add(A): tvm.testing.assert_allclose(a.numpy(), ref, rtol=1e-5) +@tvm.testing.requires_llvm +def test_llvm_order_functions(): + """Check that functions in the LLVM module are ordered alphabetically.""" + + # Note: the order is alphabetical because that's a predictable ordering. Any predictable + # ordering will work fine, but if the ordering changes, this test will need to be updated. + def make_call_extern(caller, callee): + # Create a function: + # float32 caller(float32 v) { return callee(v); } + ib = tvm.tir.ir_builder.create() + v = tvm.te.var("v", dtype="float32") + t = tvm.tir.call_extern("float32", callee, v) + ib.emit(t) + return tvm.tir.PrimFunc([v], ib.get()).with_attr("global_symbol", caller) + + # Create some functions in a random order. + functions = { + "Danny": make_call_extern("Danny", "Dave"), + "Sammy": make_call_extern("Sammy", "Eve"), + "Kirby": make_call_extern("Kirby", "Fred"), + } + mod = tvm.IRModule(functions=functions) + ir_text = tvm.build(mod, None, target="llvm").get_source("ll") + matches = re.findall(r"^define[^@]*@([a-zA-Z_][a-zA-Z0-9_]*)", ir_text, re.MULTILINE) + assert matches == sorted(matches) + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py index 03944c85ade5..acfadc9d51ad 100644 --- a/tests/python/unittest/test_target_texture_codegen_opencl.py +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -514,7 +514,7 @@ def copy_to_texture(stage): def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCHWc layout. """ + """Convolution operator in NCHWc layout.""" if out_dtype is None: out_dtype = Input.dtype @@ -694,7 +694,7 @@ def copy_to_texture(stage): def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCHWc layout. 
""" + """Convolution operator in NCHWc layout.""" if out_dtype is None: out_dtype = Input.dtype @@ -879,7 +879,7 @@ def copy_to_texture(stage): def compute_depthwise_conv2d_NCHWc_KCRSk_acc32( Input, Filter, stride, padding, dilation, out_dtype=None ): - """Depthwise convolution operator in NCHWc layout. """ + """Depthwise convolution operator in NCHWc layout.""" if out_dtype is None: out_dtype = Input.dtype assert isinstance(stride, int) or len(stride) == 2 diff --git a/tests/python/unittest/test_tir_buffer.py b/tests/python/unittest/test_tir_buffer.py index 42f9c34133df..422d730160b5 100644 --- a/tests/python/unittest/test_tir_buffer.py +++ b/tests/python/unittest/test_tir_buffer.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import pytest import tvm import tvm.testing from tvm import te @@ -84,6 +85,28 @@ def test_buffer_vload(): tvm.testing.assert_prim_expr_equal(load.index, n * 2 + 103) +def test_buffer_vload_nullptr(): + var = tvm.tir.Var("v", dtype="int32") + buf = tvm.tir.decl_buffer((1,), name="buf") + buf_load = tvm.tir.expr.BufferLoad(buffer=buf, indices=tvm.runtime.convert([0])) + buf_load_stmt = tvm.tir.stmt.Evaluate(buf_load) + for_loop = tvm.tir.stmt.For( + loop_var=var, kind=0, min_val=0, extent=buf_load, body=buf_load_stmt + ) + buf_func = tvm.tir.PrimFunc(params={}, body=for_loop) + mod = tvm.IRModule({"main": buf_func}) + # Trigger nullptr buffer bug by pass + with pytest.raises(tvm.error.TVMError) as cm: + mod = tvm.transform.Sequential( + [ + tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(), + tvm.tir.transform.CompactBufferAllocation(), + tvm.tir.transform.FlattenBuffer(), + ] + )(mod) + assert "(n != nullptr) is false" in str(cm.execption) + + def test_buffer_index_merge_mult_mod(): m = te.size_var("m") n = te.size_var("n") @@ -229,11 +252,4 @@ def check_auto_bind(): if __name__ == "__main__": - test_buffer() - test_buffer_access_ptr() - test_buffer_access_ptr_offset() - test_buffer_access_ptr_extent() - test_buffer_vload() - test_buffer_index_merge_mult_mod() - test_buffer_broadcast() - test_buffer_broadcast_expr() + pytest.main([__file__]) diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py new file mode 100644 index 000000000000..a4f8b2e77078 --- /dev/null +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -0,0 +1,832 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-function-docstring,missing-module-docstring +import sys + +import pytest + +import tvm +from tvm import tir +from tvm.script import ty +from tvm.tir.schedule.testing import verify_trace_roundtrip + +# fmt: off +# pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks + +@tvm.script.tir +def two_elementwise(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def two_elementwise_after_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + for i in range(0, 128): + for ax0, ax1 in tir.grid(1, 128): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i + ax0) + tir.bind(vj, ax1) + B[vi, vj] = A[vi, vj] * 2.0 + for j in range(0, 128): + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def blockized_1(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([8, 8], "C_outer") as [vi_o, vj_o]: + tir.reads([B[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16, + ]]) + tir.writes([C[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16 + ]]) + for i_i, j_i in tir.grid(16, 16): + with tir.block([128, 128], "C_inner") as [vi, vj]: + tir.bind(vi, vi_o * 16 + i_i) + tir.bind(vj, vj_o * 16 + j_i) + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def blockized_after_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i0_0, i1_0 in tir.grid(8, 8): + for ax0, ax1 in tir.grid(16, 16): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i0_0 * 16 + ax0) + tir.bind(vj, i1_0 * 16 + ax1) + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([8, 8], "C_outer") as [vi_o, vj_o]: + tir.bind(vi_o, i0_0) + tir.bind(vj_o, i1_0) + tir.reads([B[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16, + ]]) + tir.writes([C[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16 + ]]) + for i0_1, i1_1 in tir.grid(16, 16): + with tir.block([128, 128], "C_inner") as [vi, vj]: + tir.bind(vi, vi_o * 16 + i0_1) + tir.bind(vj, vj_o * 16 + i1_1) + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def blockized_2(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i_o, j_o in tir.grid(8, 8): + with tir.block([8, 8], "B_outer") as [vio, vjo]: + tir.bind(vio, i_o) + tir.bind(vjo, j_o) + tir.reads([A[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16, + ]]) + tir.writes([B[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16 + ]]) + for i_i, j_i in tir.grid(16, 16): + with tir.block([128, 128], "B_inner") as [vi, vj]: + tir.bind(vi, vio * 16 + i_i) + tir.bind(vj, vjo * 16 + j_i) + B[vi, vj] = A[vi, vj] * 2.0 + 
for i_o, j_o, i_i, j_i in tir.grid(4, 4, 32, 32): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i_o * 32 + i_i) + tir.bind(vj, j_o * 32 + j_i) + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def blockized_2_after_reverse_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i_o, j_o in tir.grid(8, 8): + with tir.block([8, 8], "B_outer") as [vio, vjo]: + tir.bind(vio, i_o) + tir.bind(vjo, j_o) + tir.reads([A[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16, + ]]) + tir.writes([B[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16 + ]]) + for i_i, j_i in tir.grid(16, 16): + with tir.block([128, 128], "B_inner") as [vi, vj]: + tir.bind(vi, vio * 16 + i_i) + tir.bind(vj, vjo * 16 + j_i) + B[vi, vj] = A[vi, vj] * 2.0 + for ax0, ax1 in tir.grid(16, 16): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i_o * 16 + ax0) + tir.bind(vj, j_o * 16 + ax1) + tir.reads([B[vi, vj]]) + tir.writes([C[vi, vj]]) + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def blockized_2_after_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i_o, j_o in tir.grid(4, 4): + for ax0, ax1 in tir.grid(2, 2): + with tir.block([8, 8], "blockized_B") as [vio, vjo]: + tir.bind(vio, i_o * 2 + ax0) + tir.bind(vjo, j_o * 2 + ax1) + tir.reads([A[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16, + ]]) + tir.writes([B[ + vio * 16 : vio * 16 + 16, + vjo * 16 : vjo * 16 + 16, + ]]) + for i_i, j_i in tir.grid(16, 16): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, vio * 16 + i_i) + tir.bind(vj, vjo * 16 + j_i) + B[vi, vj] = A[vi, vj] * 2.0 + for i_i, j_i in tir.grid(32, 32): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i_o * 32 + i_i) + tir.bind(vj, j_o * 32 + j_i) + C[vi, vj] = B[vi, vj] + 1.0 + +@tvm.script.tir +def cuda_matmul_0(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + A_shared[v0, v1] = A[v0, v1] + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + A_shared_local[v0, v1] = A_shared[v0, v1] + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + B_shared_local[v0, v1] = B_shared[v0, v1] + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + with tir.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in 
tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [v0_4, v1_4]: + tir.bind(v0_4, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(v1_4, bx * 64 + vx * 32 + tx * 4 + j) + C[v0_4, v1_4] = C_local[v0_4, v1_4] + + +@tvm.script.tir +def cuda_matmul_0_after_compute_at(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + A_shared[v0, v1] = A[v0, v1] + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + A_shared_local[v0, v1] = A_shared[v0, v1] + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + B_shared_local[v0, v1] = B_shared[v0, v1] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for i, j, k in tir.grid(4, 4, 2048): + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k) + with tir.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [vi, vj]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + C[vi, vj] = C_local[vi, vj] + + +@tvm.script.tir +def cuda_matmul_1(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + A_shared[v0, v1] = A[v0, v1] + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + A_shared_local[v0, v1] = A_shared[v0, v1] + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + B_shared_local[v0, v1] = B_shared[v0, v1] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx 
in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for k_0 in tir.serial(0, 256): + for k_1 in tir.unroll(0, 8): + for _, i, j in tir.grid(1, 4, 4): + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k_0 * 8 + k_1) + with tir.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [vi, vj]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + C[vi, vj] = C_local[vi, vj] + + +@tvm.script.tir +def cuda_matmul_2(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + A_shared[v0, v1] = A[v0, v1] + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + B_shared_local[v0, v1] = B_shared[v0, v1] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for k_0 in tir.serial(0, 256): + for k_1 in tir.unroll(0, 8): + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + tir.bind(v0, k_0 * 8 + k_1 + i) + tir.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + A_shared_local[v0, v1] = A_shared[v0, v1] + for _, i, j in tir.grid(1, 4, 4): + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k_0 * 8 + k_1) + with tir.init(): + C_local[vi, vj] = tir.float32(0) + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [v0, v1]: + tir.bind(v0, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + C[v0, v1] = C_local[v0, v1] + + +@tvm.script.tir +def cuda_matmul_3(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", 
scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + A_shared[v0, v1] = A[v0, v1] + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for k0 in tir.serial(0, 256): + for k1 in tir.unroll(0, 8): + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + B_shared_local[v0, v1] = B_shared[v0, v1] + for _, i, j in tir.grid(1, 4, 4): + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k0 * 8 + k1) + with tir.init(): + C_local[vi, vj] = tir.float32(0) + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [v0, v1]: + tir.bind(v0, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + C[v0, v1] = C_local[v0, v1] + + +@tvm.script.tir +def cuda_matmul_4(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + B_shared[v0, v1] = B[v0, v1] + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for k0 in tir.serial(0, 256): + for i, j in tir.grid(8, 64): + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + tir.bind(v0, k0 * 8 + i) + tir.bind(v1, by * 64 + j) + A_shared[v0, v1] = A[v0, v1] + for k1 in tir.unroll(0, 8): + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + B_shared_local[v0, v1] = B_shared[v0, v1] + for _, i, j in tir.grid(1, 4, 4): + with tir.block([2048, 2048, 
tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k0 * 8 + k1) + with tir.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [v0, v1]: + tir.bind(v0, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + C[v0, v1] = C_local[v0, v1] + + +@tvm.script.tir +def cuda_matmul_5(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=undefined-loop-variable + A = tir.match_buffer(a, [2048, 2048], "float32") + B = tir.match_buffer(b, [2048, 2048], "float32") + C = tir.match_buffer(c, [2048, 2048], "float32") + A_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + B_shared = tir.alloc_buffer([2048, 2048], "float32", scope="shared") + A_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + B_shared_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + C_local = tir.alloc_buffer([2048, 2048], "float32", scope="local") + for by in tir.thread_binding(0, 32, thread = "blockIdx.y"): + for bx in tir.thread_binding(0, 32, thread = "blockIdx.x"): + for vy in tir.thread_binding(0, 2, thread = "vthread.y"): + for vx in tir.thread_binding(0, 2, thread = "vthread.x"): + for ty in tir.thread_binding(0, 8, thread = "threadIdx.y"): + for tx in tir.thread_binding(0, 8, thread = "threadIdx.x"): + for k0 in tir.serial(0, 256): + for i, j in tir.grid(8, 64): + with tir.block([2048, 2048], "A_shared") as [v0, v1]: + tir.bind(v0, k0 * 8 + i) + tir.bind(v1, by * 64 + j) + A_shared[v0, v1] = A[v0, v1] + for i, j in tir.grid(8, 64): + with tir.block([2048, 2048], "B_shared") as [v0, v1]: + tir.bind(v0, k0 * 8 + i) + tir.bind(v1, bx * 64 + j) + B_shared[v0, v1] = B[v0, v1] + for k1 in tir.unroll(0, 8): + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "A_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in tir.grid(1, 4): + with tir.block([2048, 2048], "B_shared_local") as [v0, v1]: + tir.bind(v0, k0 * 8 + k1 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + B_shared_local[v0, v1] = B_shared[v0, v1] + for _, i, j in tir.grid(1, 4, 4): + with tir.block([2048, 2048, tir.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: + tir.bind(vi, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + tir.bind(vk, k0 * 8 + k1) + with tir.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in tir.grid(4, 4): + with tir.block([2048, 2048], "C_local") as [v0, v1]: + tir.bind(v0, by * 64 + vy * 32 + ty * 4 + i) + tir.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + C[v0, v1] = C_local[v0, v1] + + +@tvm.script.tir +def tiled(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], "float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i_0, j_0, i_1, j_1 in tir.grid(8, 8, 16, 16): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i_0 * 16 + i_1) + tir.bind(vj, j_0 * 16 + j_1) + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def tiled_after_reverse_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128], 
"float32") + B = tir.alloc_buffer([128, 128], "float32") + C = tir.match_buffer(c, [128, 128], "float32") + for i_0, j_0, i_1 in tir.grid(8, 8, 16): + for j_1 in tir.serial(0, 16): + with tir.block([128, 128], "B") as [vi, vj]: + tir.bind(vi, i_0 * 16 + i_1) + tir.bind(vj, j_0 * 16 + j_1) + B[vi, vj] = A[vi, vj] * 2.0 + for j_1 in tir.serial(0, 16): + with tir.block([128, 128], "C") as [vi, vj]: + tir.bind(vi, i_0 * 16 + i_1) + tir.bind(vj, j_0 * 16 + j_1) + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def factorized(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [16, 16, 16], "float32") + B = tir.match_buffer(b, [16], "float32") + B_rf_local = tir.alloc_buffer([16, 16], "float32", scope="local") + for j in tir.thread_binding(0, 16, thread = "blockIdx.x"): + for i_o in tir.thread_binding(0, 4, thread = "threadIdx.x"): + for i_i, k in tir.grid(4, 16): + with tir.block([16, 16, tir.reduce_axis(0, 16)], "B_rf") as [vi, vj, vk]: + tir.bind(vi, i_o * 4 + i_i) + tir.bind(vj, j) + tir.bind(vk, k) + with tir.init(): + B_rf_local[vi, vj] = 0.0 + B_rf_local[vi, vj] = B_rf_local[vi, vj] + A[vj, vi, vk] + for i, k in tir.grid(16, 16): + with tir.block([16, tir.reduce_axis(0, 16)], "B") as [vi, vk]: + tir.bind(vi, i) + tir.bind(vk, k) + with tir.init(): + B[vi] = 0.0 + B[vi] = B[vi] + B_rf_local[vk, vi] + + +@tvm.script.tir +def factorized_after_reverse_compute_at(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [16, 16, 16], "float32") + B = tir.match_buffer(b, [16], "float32") + B_rf_local = tir.alloc_buffer([16, 16], "float32", scope="local") + for j in tir.thread_binding(0, 16, thread = "blockIdx.x"): + for i_o in tir.thread_binding(0, 4, thread = "threadIdx.x"): + for i_i, k in tir.grid(4, 16): + with tir.block([16, 16, tir.reduce_axis(0, 16)], "B_rf") as [vi, vj, vk]: + tir.bind(vi, i_o * 4 + i_i) + tir.bind(vj, j) + tir.bind(vk, k) + with tir.init(): + B_rf_local[vi, vj] = 0.0 + B_rf_local[vi, vj] = B_rf_local[vi, vj] + A[vj, vi, vk] + for k in tir.serial(0, 4): + with tir.block([16, tir.reduce_axis(0, 16)], "B") as [vi, vk]: + tir.bind(vi, j) + tir.bind(vk, i_o * 4 + k) + with tir.init(): + B[vi] = 0.0 + B[vi] = B[vi] + B_rf_local[vk, vi] + + +@tvm.script.tir +def fail_subtree_compact_dataflow(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + for i in range(0, 128): + for j in range(0, 64): + with tir.block([128, 128], "B_0") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j) + B[vi, vj] = A[vi, vj] * 2.0 + for j in range(0, 64): + with tir.block([128, 128], "B_1") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, j + 64) + B[vi, vj] = A[vi, vj] * 2.0 + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def fail_all_consumers_under_loop(a: ty.handle, c: ty.handle, d: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + D = tir.match_buffer(d, (128, 128), "float32") + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + 1.0 + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "D") as [vi, vj]: + D[vi, vj] = B[vi, vj] + 1.0 + + +@tvm.script.tir +def fail_all_producers_under_loop(a: ty.handle, d: ty.handle) -> 
None: + A = tir.match_buffer(a, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.alloc_buffer((128, 128), "float32") + D = tir.match_buffer(d, (128, 128), "float32") + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = A[vi, vj] + 1.0 + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "D") as [vi, vj]: + D[vi, vj] = B[vi, vj] + C[vi, vj] + + +@tvm.script.tir +def read_out_of_bound(a: ty.handle, c:ty.handle) -> None: + A = tir.match_buffer(a, [16], "float32") + B = tir.alloc_buffer([16], "float32") + C = tir.match_buffer(c, [16], "float32") + for i in tir.serial(0, 16): + with tir.block([16], "B") as [v]: + B[v] = A[v] + for j in tir.serial(0, 16): + with tir.block([16], "C") as [v]: + tir.reads(B[v : v + 2]) + C[v] = tir.if_then_else(v < 15, tir.max(B[v], B[v + 1]), B[v], dtype="float32") + + +@tvm.script.tir +def read_out_of_bound_after_compute_at(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [16], "float32") + B = tir.alloc_buffer([16], "float32") + C = tir.match_buffer(c, [16], "float32") + for j in tir.serial(0, 16): + for i in tir.serial(0, tir.min(1, 15 - j) + 1): + with tir.block([16], "B") as [v]: + tir.bind(v, j + i) + B[v] = A[v] + with tir.block([16], "C") as [v]: + tir.bind(v, j) + tir.reads([B[v : v + 2]]) + C[v] = tir.if_then_else(v < 15, tir.max(B[v], B[v + 1]), B[v], dtype="float32") + + +# pylint: enable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks +# fmt: on + + +def test_compute_at_two_elementwise(): + sch = tir.Schedule(two_elementwise, debug_mask="all") + block = sch.get_block("B") + loop, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(two_elementwise_after_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=two_elementwise) + + +def test_compute_at_blockized_1(): + sch = tir.Schedule(blockized_1, debug_mask="all") + block = sch.get_block("B") + _, loop = sch.get_loops(sch.get_block("C_outer")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(blockized_after_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=blockized_1) + + +def test_compute_at_blockized_2(): + sch = tir.Schedule(blockized_2, debug_mask="all") + block = sch.get_block("B_outer") + _, loop, _, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(blockized_2_after_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=blockized_2) + + +def test_compute_at_cuda_matmul_0(): + sch = tir.Schedule(cuda_matmul_0, debug_mask="all") + block = sch.get_block("C") + _, _, _, _, _, loop, _, _ = sch.get_loops(sch.get_block("C_local")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(cuda_matmul_0_after_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=cuda_matmul_0) + + +def test_compute_at_cuda_matmul_1(): + sch = tir.Schedule(cuda_matmul_1, debug_mask="all") + block = sch.get_block("A_shared_local") + _, _, _, _, _, _, _, loop, _, _, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(cuda_matmul_2, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=cuda_matmul_1) + + 
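+# Note: test_compute_at_cuda_matmul_1 through _4 form a chain; each schedules cuda_matmul_N and checks the result against cuda_matmul_N+1, so the expected fixture of one compute_at step doubles as the input of the next.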
+def test_compute_at_cuda_matmul_2(): + sch = tir.Schedule(cuda_matmul_2, debug_mask="all") + block = sch.get_block("B_shared_local") + _, _, _, _, _, _, _, loop, _, _, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(cuda_matmul_3, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=cuda_matmul_2) + + +def test_compute_at_cuda_matmul_3(): + sch = tir.Schedule(cuda_matmul_3, debug_mask="all") + block = sch.get_block("A_shared") + _, _, _, _, _, _, loop, _, _, _, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(cuda_matmul_4, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=cuda_matmul_3) + + +def test_compute_at_cuda_matmul_4(): + sch = tir.Schedule(cuda_matmul_4, debug_mask="all") + block = sch.get_block("B_shared") + _, _, _, _, _, _, loop, _, _, _, _ = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(cuda_matmul_5, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=cuda_matmul_4) + + +def test_reverse_compute_at_tiled(): + sch = tir.Schedule(tiled, debug_mask="all") + block = sch.get_block("C") + _, _, loop, _ = sch.get_loops(sch.get_block("B")) + sch.reverse_compute_at(block, loop, preserve_unit_loops=False) + tvm.ir.assert_structural_equal(tiled_after_reverse_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=tiled) + + +def test_reverse_compute_at_blockized_2(): + sch = tir.Schedule(blockized_2, debug_mask="all") + block = sch.get_block("C") + _, loop = sch.get_loops(sch.get_block("B_outer")) + sch.reverse_compute_at(block, loop, preserve_unit_loops=True) + tvm.ir.assert_structural_equal(blockized_2_after_reverse_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=blockized_2) + + +def test_reverse_compute_at_factorized(): + sch = tir.Schedule(factorized, debug_mask="all") + block = sch.get_block("B") + _, loop, _, _ = sch.get_loops(sch.get_block("B_rf")) + sch.reverse_compute_at(block, loop, preserve_unit_loops=False) + tvm.ir.assert_structural_equal(factorized_after_reverse_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=factorized) + + +def test_read_out_of_bound(): + sch = tir.Schedule(read_out_of_bound, debug_mask="all") + block = sch.get_block("B") + (loop,) = sch.get_loops(sch.get_block("C")) + sch.compute_at(block, loop) + tvm.ir.assert_structural_equal(read_out_of_bound_after_compute_at, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=read_out_of_bound) + + +def test_fail_subtree_compact_dataflow(): + sch = tir.Schedule(fail_subtree_compact_dataflow, debug_mask="all") + block = sch.get_block("B_0") + loop, _ = sch.get_loops(sch.get_block("C")) + with pytest.raises(tvm.tir.ScheduleError, match="compact dataflow"): + sch.compute_at(block, loop) + + +def test_fail_not_in_same_scope(): + sch = tir.Schedule(blockized_1, debug_mask="all") + block = sch.get_block("B") + loop, _ = sch.get_loops(sch.get_block("C_inner")) + with pytest.raises(tvm.tir.ScheduleError, match="same block scope"): + sch.compute_at(block, loop) + + +def test_fail_loop_is_ancestor_of_block(): + sch = tir.Schedule(two_elementwise, debug_mask="all") + block = sch.get_block("B") + loop, _ = sch.get_loops(sch.get_block("B")) + with pytest.raises(tvm.tir.ScheduleError, match="ancestor of block"): + sch.compute_at(block, loop) + + +def test_fail_output_block(): + sch = tir.Schedule(tiled, debug_mask="all") + block = 
sch.get_block("C") + loop, _, _, _ = sch.get_loops(sch.get_block("B")) + with pytest.raises(tvm.tir.ScheduleError, match="output block"): + sch.compute_at(block, loop) + + +def test_fail_all_consumers_under_loop(): + sch = tir.Schedule(fail_all_consumers_under_loop, debug_mask="all") + block = sch.get_block("B") + loop, _ = sch.get_loops(sch.get_block("C")) + with pytest.raises(tvm.tir.ScheduleError, match="requires all the consumer"): + sch.compute_at(block, loop) + + +def test_fail_all_producers_under_loop(): + sch = tir.Schedule(fail_all_producers_under_loop, debug_mask="all") + block = sch.get_block("D") + loop, _ = sch.get_loops(sch.get_block("C")) + with pytest.raises(tvm.tir.ScheduleError, match="requires all the producer"): + sch.reverse_compute_at(block, loop) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_tir_transform_ir_utils.py b/tests/python/unittest/test_tir_transform_ir_utils.py new file mode 100644 index 000000000000..b6752ee3efd3 --- /dev/null +++ b/tests/python/unittest/test_tir_transform_ir_utils.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest +import tvm +from tvm import tir + + +def test_convert_ssa(): + zero = tir.const(0) + nop = tir.Evaluate(zero) + v = tir.Var("i1", "int32") + for_stmt = tir.For(v, zero, zero, tir.ForKind.SERIAL, nop) + load = tir.Evaluate(tir.Load("int32", v, zero)) + # The same For node appears twice in the SeqStmt, so ConvertSSA must rename the duplicated definition of the loop variable. + seq = tir.SeqStmt([for_stmt, for_stmt, load]) + func = tir.PrimFunc([], seq) + mod = tvm.IRModule({"main": func}) + mod = tir.transform.InjectVirtualThread()( + mod + ) # Use pass InjectVirtualThread to invoke ConvertSSA + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/unittest/test_tvm_testing_features.py b/tests/python/unittest/test_tvm_testing_features.py index f396eeeee5fb..cbcdc4356250 100644 --- a/tests/python/unittest/test_tvm_testing_features.py +++ b/tests/python/unittest/test_tvm_testing_features.py @@ -87,6 +87,11 @@ def test_known_failing_target(self, target): def test_all_targets_ran(self): assert self.run_targets_with_known_failure == self.enabled_targets + @tvm.testing.known_failing_targets("llvm") + @tvm.testing.parametrize_targets("llvm") + def test_known_failing_explicit_list(self, target): + assert target != "llvm" + class TestJointParameter: param1_vals = [1, 2, 3] @@ -98,7 +103,8 @@ class TestJointParameter: joint_usages = 0 joint_param_vals = list(zip(param1_vals, param2_vals)) - joint_param1, joint_param2 = tvm.testing.parameters(*joint_param_vals) + joint_param_ids = ["apple", "pear", "banana"] + joint_param1, joint_param2 = tvm.testing.parameters(*joint_param_vals, ids=joint_param_ids) def test_using_independent(self, param1, param2): type(self).independent_usages += 1 @@ -113,6 +119,14 @@ def test_using_joint(self, joint_param1, joint_param2): def test_joint(self): assert self.joint_usages == len(self.joint_param_vals) + def test_joint_test_id(self, joint_param1, joint_param2, request): + param_string = ( + request.node.name.replace(request.node.originalname, "") + .replace("[", "") + .replace("]", "") + ) + assert param_string in self.joint_param_ids + class TestFixtureCaching: param1_vals = [1, 2, 3] @@ -202,8 +216,8 @@ def test_num_uses_cached(self): class TestAutomaticMarks: @staticmethod def check_marks(request, target): - parameter = tvm.testing.plugin._pytest_target_params([target])[0] - required_marks = [decorator.mark for decorator in parameter.marks] + decorators = tvm.testing.plugin._target_to_requirement(target) + required_marks = [decorator.mark for decorator in decorators] applied_marks = list(request.node.iter_markers()) for required_mark in required_marks: diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index f9aee67f1d71..7c123afdc4d0 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+import sys +import pytest + import tvm from tvm import tir from tvm.script import ty @@ -2675,9 +2678,9 @@ def vthread_func(a: ty.handle, c: ty.handle) -> None: tir.launch_thread(i1, 2) tir.launch_thread(i2, 2) B = tir.allocate([16], "float32", "local") - for j in range(0, 16): + for j in range(16): B[j] = tir.load("float32", A.data, i0 * 64 + i1 * 32 + i2 * 16 + j) + tir.float32(1) - for j in range(0, 16): + for j in range(16): C.data[i0 * 64 + i1 * 32 + i2 * 16 + j] = tir.load("float32", B, j) * tir.float32(2) @@ -2709,7 +2712,7 @@ def matmul_original(a: ty.handle, b: ty.handle, c: ty.handle) -> None: with tir.block([128, 128], "init") as [vi, vj]: C[vi, vj] = tir.float32(0) - for k in range(0, 128): + for k in range(128): with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -2898,8 +2901,8 @@ def opaque_block(a: ty.handle, b: ty.handle) -> None: A = tir.match_buffer(a, (16, 16), "float32") B = tir.match_buffer(b, (16, 16), "float32") - for i in range(0, 16): - for j in range(0, 16): + for i in range(16): + for j in range(16): with tir.block([]): tir.reads([]) tir.writes(A[i, j]) @@ -2907,7 +2910,7 @@ def opaque_block(a: ty.handle, b: ty.handle) -> None: with tir.block([]): tir.reads([A[i, 0:16]]) tir.writes([B[i, 0:16]]) - for j in range(0, 16): + for j in range(16): B[i, j] = A[i, j] @@ -2951,7 +2954,7 @@ def rank0_block(a: ty.handle) -> None: with tir.block([], "update") as []: tir.reads([A[()]]) tir.writes([B[()]]) - for i in range(0, 1): + for i in range(1): B[()] = A[()] @@ -3008,28 +3011,77 @@ def constant_folding(a: ty.handle) -> None: A[()] = tir.min(2.2, 5.0) -def test_script_printer(): +def test_constant_folding(): func = constant_folding rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) tvm.ir.assert_structural_equal(func, rt_func) +@tvm.script.tir +def simplify_bracket() -> None: + a = tir.var("int32") + b = tir.var("int32") + c = tir.var("int32") + d = tir.var("int32") + tir.evaluate(a + b * (c + d)) + + +def test_simplify_bracket(): + func = simplify_bracket + out_str = tvm.script.asscript(func, True) + assert out_str.count("a + b*(c + d)") == 1 + + +@tvm.script.tir +def var_with_same_name(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = 0 + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = 0 + for i, j in tir.grid(16, 16): + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = 0 + for i, j in tir.grid(16, 16): + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = 0 + + +def test_same_name_var(): + func = var_with_same_name + out_str = tvm.script.asscript(func, True) + rt_func = tvm.script.from_source(out_str) + tvm.ir.assert_structural_equal(func, rt_func) + + assert out_str.count("with tir.block([16, 16]) as [vi, vj]") == 4 + assert out_str.find("vi_") == -1 + assert out_str.find("vj_") == -1 + + assert out_str.count("for i0, i1 in tir.grid(16, 16)") == 2 + assert out_str.find("i0_") == -1 + assert out_str.find("i1_") == -1 + + assert out_str.count("for i, j in tir.grid(16, 16)") == 2 + assert out_str.find("i_") == -1 + assert out_str.find("j_") == -1 + + +@tvm.script.tir +def while_loop(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16,), "float32") + B = tir.match_buffer(b, (16,), "float32") + i = tir.alloc_buffer((), "int32", scope="local") + with tir.block([16]) as [vi]: + B[vi] = 0 + while i[()] < 10: + for j in range(16): + B[j] += A[j] + + +def test_while_loop(): + rt_func = 
tvm.script.from_source(tvm.script.asscript(while_loop, True)) + tvm.ir.assert_structural_equal(while_loop, rt_func) + + if __name__ == "__main__": - test_opt_gemm_normalize() - test_opt_gemm_mod_host() - test_opt_gemm_lower() - test_opt_conv_tensorcore_normalize() - test_opt_conv_tensorcore_lower() - test_opt_conv_tensorcore_mod_host() - test_vthread() - test_module_define() - test_matmul() - test_matmul_original() - test_element_wise() - test_predicate() - test_for_thread_binding() - test_match_buffer_region() - test_block_elements() - test_opaque_block() - test_abs() - test_script_printer() + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh index 753d17d8afe5..01d5587e70ad 100755 --- a/tests/scripts/task_ci_setup.sh +++ b/tests/scripts/task_ci_setup.sh @@ -30,7 +30,7 @@ set -o pipefail # echo "Additional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.2.1 synr==0.3.0 +python3 -m pip install --user tlcpack-sphinx-addon==0.2.1 synr==0.4.0 # Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in # Jenkinsfile. We expect config.cmake to be present from pack_lib(). diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 167e5becd4a7..ace88b46ae0c 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -43,7 +43,9 @@ echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake +echo set\(USE_CMSISNN ON\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake echo set\(USE_VERILATOR ON\) >> config.cmake echo set\(USE_LIBBACKTRACE ON\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake +echo set\(USE_ETHOSU ON\) >> config.cmake diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh index b05acb090c2f..8507f311e9da 100755 --- a/tests/scripts/task_mypy.sh +++ b/tests/scripts/task_mypy.sh @@ -25,3 +25,6 @@ mypy --check-untyped-defs python/tvm/tir/analysis/ echo "Checking MyPy Type defs in the transform package." mypy --check-untyped-defs python/tvm/tir/transform/ + +echo "Checking MyPy Type defs in the tvm.relay.backend.contrib.ethosu package." 
+mypy --check-untyped-defs python/tvm/relay/backend/contrib/ethosu/ diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index 7b7758c3da24..4bee8d566f11 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -23,9 +23,11 @@ set -x # NOTE(areusch): Adding to diagnose flaky timeouts source tests/scripts/setup-pytest-env.sh make cython3 -run_pytest ctypes python-microtvm-zephyr tests/micro/zephyr --microtvm-platforms=qemu_x86 -run_pytest ctypes python-microtvm-zephyr tests/micro/zephyr --microtvm-platforms=mps2_an521 +run_pytest ctypes python-microtvm-zephyr tests/micro/zephyr --zephyr-board=qemu_x86 +# Temporarily removing mps2_an521 from CI due to issue 8728: +# https://github.com/apache/tvm/issues/8728 +# run_pytest ctypes python-microtvm-zephyr tests/micro/zephyr --zephyr-board=mps2_an521 run_pytest ctypes python-microtvm-arduino apps/microtvm/arduino/template_project/tests -run_pytest ctypes python-microtvm-arduino-nano33ble tests/micro/arduino --test-build-only --microtvm-platforms=nano33ble -run_pytest ctypes python-microtvm-arduino-due tests/micro/arduino --test-build-only --microtvm-platforms=due +run_pytest ctypes python-microtvm-arduino-nano33ble tests/micro/arduino --test-build-only --arduino-board=nano33ble +run_pytest ctypes python-microtvm-arduino-due tests/micro/arduino --test-build-only --arduino-board=due diff --git a/tutorials/micro/micro_autotune.py b/tutorials/micro/micro_autotune.py new file mode 100644 index 000000000000..f89432ff01cf --- /dev/null +++ b/tutorials/micro/micro_autotune.py @@ -0,0 +1,250 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +.. _tutorial-micro-autotune: + +Autotuning with micro TVM +========================= +**Author**: `Andrew Reusch `_, `Mehrdad Hessar `_ + +This tutorial explains how to autotune a model using the C runtime. +""" + +import numpy as np +import subprocess +import pathlib + +import tvm + +#################### +# Defining the model +#################### +# +# To begin with, define a model in Relay to be executed on-device. Then create an IRModule from the Relay model and +# fill its parameters with random numbers. 
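+# The model is a single conv2d operator, so the task extraction step further below should yield just one tunable conv2d task.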
+# + +data_shape = (1, 3, 10, 10) +weight_shape = (6, 3, 5, 5) + +data = tvm.relay.var("data", tvm.relay.TensorType(data_shape, "float32")) +weight = tvm.relay.var("weight", tvm.relay.TensorType(weight_shape, "float32")) + +y = tvm.relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + kernel_layout="OIHW", + out_dtype="float32", +) +f = tvm.relay.Function([data, weight], y) + +relay_mod = tvm.IRModule.from_expr(f) +relay_mod = tvm.relay.transform.InferType()(relay_mod) + +weight_sample = np.random.rand( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3] +).astype("float32") +params = {"weight": weight_sample} + +####################### +# Defining the target # +####################### +# Now we define the TVM target that describes the execution environment. This looks very similar +# to target definitions from other microTVM tutorials. +# +# When running on physical hardware, choose a target and a board that +# describe the hardware. There are multiple hardware targets that can be selected from +# the PLATFORM list in this tutorial. You can choose the platform by passing the --platform argument when running +# this tutorial. +# +TARGET = tvm.target.target.micro("host") + +# Compiling for physical hardware +# -------------------------------------------------------------------------- +# When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The +# STM32L4R5ZI Nucleo target and board are chosen in the example below. +# +# TARGET = tvm.target.target.micro("stm32l4r5zi") +# BOARD = "nucleo_l4r5zi" + +######################### +# Extracting tuning tasks +######################### +# Not all operators in the Relay program defined above can be tuned. Some are so trivial that only +# a single implementation is defined; others don't make sense as tuning tasks. Using +# `extract_from_program`, you can produce a list of tunable tasks. +# +# Because task extraction involves running the compiler, we first configure the compiler's +# transformation passes; we'll apply the same configuration later on during autotuning. + +pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}) +with pass_context: + tasks = tvm.autotvm.task.extract_from_program(relay_mod["main"], {}, TARGET) +assert len(tasks) > 0 + +###################### +# Configuring microTVM +###################### +# Before autotuning, we need to define a module loader and pass it to a +# `tvm.autotvm.LocalRunner`, together with a `tvm.autotvm.LocalBuilder`; the +# builder and runner generate multiple measurements for the autotuner. +# +# In this tutorial, we have the option to use the x86 host as an example or to use different targets +# running Zephyr RTOS. If you pass `--platform=host` to this tutorial, it will use the x86 host. You can +# choose other options from the `PLATFORM` list. 
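+# Roughly, the division of labor below is: the module loader points at the C runtime's host template project, the builder compiles each candidate configuration into a project generated from that template, and the runner loads and times the result to produce a measurement.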
+# + +repo_root = pathlib.Path( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding="utf-8").strip() +) + +module_loader = tvm.micro.AutoTvmModuleLoader( + template_project_dir=repo_root / "src" / "runtime" / "crt" / "host", + project_options={}, +) +builder = tvm.autotvm.LocalBuilder( + n_parallel=1, + build_kwargs={"build_option": {"tir.disable_vectorize": True}}, + do_fork=True, + build_func=tvm.micro.autotvm_build_func, +) +runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader) + +measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner) + +# Compiling for physical hardware +# -------------------------------------------------------------------------- +# module_loader = tvm.micro.AutoTvmModuleLoader( +# template_project_dir=repo_root / "apps" / "microtvm" / "zephyr" / "template_project", +# project_options={ +# "zephyr_board": BOARD, +# "west_cmd": "west", +# "verbose": 1, +# "project_type": "host_driven", +# }, +# ) +# builder = tvm.autotvm.LocalBuilder( +# n_parallel=1, +# build_kwargs={"build_option": {"tir.disable_vectorize": True}}, +# do_fork=False, +# build_func=tvm.micro.autotvm_build_func, +# ) +# runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader) + +# measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner) + +################ +# Run Autotuning +################ +# Now we can run autotuning separately on each extracted task. + +num_trials = 10 +for task in tasks: + tuner = tvm.autotvm.tuner.GATuner(task) + tuner.tune( + n_trial=num_trials, + measure_option=measure_option, + callbacks=[ + tvm.autotvm.callback.log_to_file("microtvm_autotune.log.txt"), + tvm.autotvm.callback.progress_bar(num_trials, si_prefix="M"), + ], + si_prefix="M", + ) + +############################ +# Timing the untuned program +############################ +# For comparison, let's compile and run the graph without imposing any autotuning schedules. TVM +# will fall back to a default schedule for each operator, which should not perform as well as +# the tuned one. 
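+# Both builds below reuse the pass_context defined during task extraction, so the untuned and tuned programs differ only in the schedules applied.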
+ +with pass_context: + lowered = tvm.relay.build(relay_mod, target=TARGET, params=params) + +temp_dir = tvm.contrib.utils.tempdir() + +project = tvm.micro.generate_project( + str(repo_root / "src" / "runtime" / "crt" / "host"), lowered, temp_dir / "project" +) + +# Compiling for physical hardware +# -------------------------------------------------------------------------- +# project = tvm.micro.generate_project( +# str(repo_root / "apps" / "microtvm" / "zephyr" / "template_project"), +# lowered, +# temp_dir / "project", +# { +# "zephyr_board": BOARD, +# "west_cmd": "west", +# "verbose": 1, +# "project_type": "host_driven", +# }, +# ) + +project.build() +project.flash() +with tvm.micro.Session(project.transport()) as session: + debug_module = tvm.micro.create_local_debug_executor( + lowered.get_graph_json(), session.get_system_lib(), session.device + ) + debug_module.set_input(**lowered.get_params()) + print("########## Build without Autotuning ##########") + debug_module.run() + del debug_module + +########################## +# Timing the tuned program +########################## +# Once autotuning completes, you can time execution of the entire program using the Debug Runtime: + +with tvm.autotvm.apply_history_best("microtvm_autotune.log.txt"): + with pass_context: + lowered_tuned = tvm.relay.build(relay_mod, target=TARGET, params=params) + +temp_dir = tvm.contrib.utils.tempdir() + +project = tvm.micro.generate_project( + str(repo_root / "src" / "runtime" / "crt" / "host"), lowered_tuned, temp_dir / "project" +) + +# Compiling for physical hardware +# -------------------------------------------------------------------------- +# project = tvm.micro.generate_project( +# str(repo_root / "apps" / "microtvm" / "zephyr" / "template_project"), +# lowered_tuned, +# temp_dir / "project", +# { +# "zephyr_board": BOARD, +# "west_cmd": "west", +# "verbose": 1, +# "project_type": "host_driven", +# }, +# ) + +project.build() +project.flash() +with tvm.micro.Session(project.transport()) as session: + debug_module = tvm.micro.create_local_debug_executor( + lowered_tuned.get_graph_json(), session.get_system_lib(), session.device + ) + debug_module.set_input(**lowered_tuned.get_params()) + print("########## Build with Autotuning ##########") + debug_module.run() + del debug_module diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py index bb262893eb6b..773329405282 100644 --- a/tutorials/micro/micro_reference_vm.py +++ b/tutorials/micro/micro_reference_vm.py @@ -111,13 +111,13 @@ Rebuilding TVM inside the Reference VM -------------------------------------- -After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm``, +After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm-zephyr``, up-to-date when you modify the C++ runtime or checkout a different revision. You can either re-provision the machine (``vagrant provision`` in the same directory you ran ``vagrant up`` before) or manually rebuild TVM yourself. Remember: the TVM ``.so`` built inside the VM is different from the one you may use on your host -machine. This is why it's built inside the special directory ``build-microtvm``. +machine. This is why it's built inside the special directory ``build-microtvm-zephyr``. Logging in to the VM -------------------- @@ -143,7 +143,7 @@ .. 
code-block:: bash $ cd apps/microtvm/reference-vm/zephyr - $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx + $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=stm32f746g_disco If you do not have physical hardware attached, but wish to run the tests using the local QEMU emulator running within the VM, run the following commands instead: @@ -152,7 +152,7 @@ $ cd /Users/yourusername/path/to/tvm $ cd apps/microtvm/reference-vm/zephyr/ - $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=host + $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=qemu_x86 diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 139e30333f1e..572aaee7c3b4 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -191,7 +191,7 @@ env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1], - device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim"), + device_annot=(env.TARGET == "intelfocl"), ) else: relay_prog = mod["main"] @@ -203,7 +203,7 @@ relay_prog, target=target, params=params, target_host=env.target_host ) else: - if env.TARGET == "intelfocl" or env.TARGET == "sim": + if env.TARGET == "intelfocl": # multiple targets to run both on cpu and vta target = {"cpu": env.target_vta_cpu, "ext_dev": target} with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): @@ -221,7 +221,7 @@ remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - if env.TARGET == "intelfocl" or env.TARGET == "sim": + if env.TARGET == "intelfocl": ctxes = [remote.ext_dev(0), remote.cpu(0)] m = graph_executor.create(graph, lib, ctxes) else: