diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 5e5749055bb0..7a348f3f1a45 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -20,6 +20,54 @@
 from .. import TensorIntrin
 
 
+def generate_dma_load_intrin(
+    size: int,
+    dtype: str,
+):
+    """Generator of dma_load intrins"""
+
+    @T.prim_func
+    def sync_dma_load_desc(a: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global")
+        C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm")
+        with T.block("root"):
+            T.reads(A[0:size])
+            T.writes(C[0:size])
+            for i in T.serial(size):
+                with T.block("load"):
+                    vii = T.axis.remap("S", [i])
+                    C[vii] = A[vii]
+
+    @T.prim_func
+    def sync_dma_load_impl(a: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global")
+        C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm")
+        with T.block("root"):
+            T.reads(A[0:size])
+            T.writes(C[0:size])
+            T.evaluate(
+                T.tvm_call_packed(
+                    "device_api.hexagon.dma_copy",
+                    -1,  # Use QueueId of -1 to not interfere with async copies.
+                    T.address_of(C[0], dtype="handle"),
+                    T.address_of(A[0], dtype="handle"),
+                    size,
+                    0,  # Do not use experimental bypass mode.
+                    dtype="int32",
+                )
+            )
+            T.evaluate(
+                T.tvm_call_packed(
+                    "device_api.hexagon.dma_wait",
+                    -1,
+                    0,  # Wait for the sync queue (-1) to have 0 messages.
+                    dtype="int32",
+                )
+            )
+
+    return sync_dma_load_desc, sync_dma_load_impl
+
+
 def generate_dot_product_32x4_u8u8i32(mem_scope="global"):
     @T.prim_func
     def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
@@ -163,3 +211,9 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N
 
 VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy"
 TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm"))
+
+DMA_READ_128_u8 = "dma_read_128_u8"
+TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8"))
+
+DMA_READ_128_i8 = "dma_read_128_i8"
+TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8"))
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 254eb00cb2ea..53d0428a5ad1 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -20,7 +20,9 @@
 import numpy as np
 
 import tvm
+import pytest
 from tvm.script import tir as T
+from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8
 
 from .infrastructure import get_hexagon_target
 
@@ -30,6 +32,7 @@
     "Test bandwidth with buffer size {}MB... \n"
     "    -Base: {} GBps \n    -Vectorized: {} GBps\n"
     "    -Vectorized and Parallelized: {} GBps\n"
+    "    -Sync DMA: {} GBps\n"
     "    -Single DMA Copy: {} GBps\n"
 )
 
@@ -103,13 +106,12 @@ def evaluate(hexagon_session, sch, size):
         a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
     )
 
-    # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
-    number = 1
-    repeat = 1
+    if tvm.testing.utils.IS_IN_CI:
+        # Run with reduced number and repeat for CI.
+        timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1)
+    else:
+        timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10)
 
-    timer = module.time_evaluator(
-        "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
-    )
     runtime = timer(a_hexagon, a_vtcm_hexagon)
 
     gbps = round((size / 2**30) / runtime.mean, 4)
@@ -123,18 +125,11 @@ class TestMatMulVec:
 
     # Removed most of these to speedup CI.
     size = tvm.testing.parameter(
-        # 10 * KB,
-        # 20 * KB,
-        # 40 * KB,
-        # 80 * KB,
-        # 160 * KB,
-        # 320 * KB,
-        640 * KB,
-        # MB,
-        # 2 * MB,
-        # 3 * MB,
-        # 4 * MB,
-        # 8 * MB,  # Only works on 8gen1 HDKs
+        128,
+        KB,
+        10 * KB,
+        100 * KB,
+        MB,
     )
 
     outer_split = tvm.testing.parameter(4)
@@ -144,6 +139,10 @@ class TestMatMulVec:
     @tvm.testing.requires_hexagon
     def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split):
         """Test bandwidth."""
+
+        if tvm.testing.utils.IS_IN_CI and (size > 128):
+            pytest.skip("Skipping test since it takes too long in CI.")
+
         # Run the base memcopy operator.
         sch = tvm.tir.Schedule(memcopy_operator(size))
         base_gpbs = evaluate(hexagon_session, sch, size)
@@ -169,6 +168,15 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
         sch.parallel(vbo_a)
         parallel_gbps = evaluate(hexagon_session, sch, size)
 
+        # Run with the copy loop tensorized to use the synchronous DMA load intrinsic.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        block = sch.get_block("A_global.vtcm")
+        loops = sch.get_loops(block)
+        _, inner = sch.split(loops[0], [None, 128])
+        sch.tensorize(inner, DMA_READ_128_i8)
+        # print(sch.mod.script())
+        sync_dma_gbps = evaluate(hexagon_session, sch, size)
+
         # Run using a single dma copy to transfer the data.
         sch = tvm.tir.Schedule(single_dma_operator(size))
         single_dma_gbps = evaluate(hexagon_session, sch, size)
@@ -176,7 +184,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
 
         mbs = round(size / MB, 2)
         print(
             TEST_OUTPUT_TEMPLATE.format(
-                mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps
+                mbs, base_gpbs, vectorize_gbps, parallel_gbps, sync_dma_gbps, single_dma_gbps
             )
         )
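Usage note (editorial sketch, not part of the patch): the new intrinsics are driven through schedule tensorization, exactly as the test above does with `sch.split` and `sch.tensorize`. Below is a minimal, self-contained Python sketch of the same pattern. The prim func `copy_to_vtcm` and the constant `SIZE` are hypothetical stand-ins for the test's `memcopy_operator`; the only real requirement is a 1-D "global" to "global.vtcm" copy block whose loop splits into chunks of 128 "int8" elements, the shape and dtype that `DMA_READ_128_i8` was registered with.

import tvm
from tvm.script import tir as T

# Importing the intrinsic name also executes the TensorIntrin.register calls above.
from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8

SIZE = 1024  # hypothetical buffer size; must be a multiple of 128


@T.prim_func
def copy_to_vtcm(a: T.handle, c: T.handle) -> None:
    # A plain global -> VTCM copy; the loop body matches sync_dma_load_desc.
    A = T.match_buffer(a, (SIZE,), "int8", scope="global")
    C = T.match_buffer(c, (SIZE,), "int8", scope="global.vtcm")
    for i in T.serial(SIZE):
        with T.block("load"):
            vi = T.axis.remap("S", [i])
            C[vi] = A[vi]


sch = tvm.tir.Schedule(copy_to_vtcm)
(loop,) = sch.get_loops(sch.get_block("load"))
# Split so the inner loop covers exactly the 128 elements the intrinsic
# describes, then swap that loop for the packed DMA calls.
_, inner = sch.split(loop, [None, 128])
sch.tensorize(inner, DMA_READ_128_i8)
print(sch.mod.script())

After tensorization the inner loop is replaced by the `device_api.hexagon.dma_copy` and `device_api.hexagon.dma_wait` packed calls from `sync_dma_load_impl`; the queue id of -1 keeps these synchronous transfers out of the asynchronous DMA queues, which is what lets `evaluate` time them back to back.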