diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 1723fd3d4fb73..e43af19ff9390 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -220,7 +220,7 @@ else() target_compile_options(hexagon_npu_skel_OBJS PUBLIC -fsanitize=address -fno-omit-frame-pointer ) - target_link_libraries(hexagon_npu_skel_OBJS PUBLIC + target_link_options(hexagon_npu_skel_OBJS PUBLIC -fsanitize=address ) endif() @@ -248,9 +248,9 @@ else() add_library(hexagon_npu_skel SHARED $) target_link_libraries(hexagon_npu_skel - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1 + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1 + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") target_link_libraries(hexagon_npu_skel qprintf_static) diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index f07391711241d..cbe136cf1fb74 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -17,21 +17,30 @@ namespace { struct npu_device_context { + std::unique_ptr power_utils; // Power management utilities std::unique_ptr thread_pool; std::unique_ptr f16_to_f32_table; // TODO: store vtcm? bool init() { if (!init_ltu()) { - DEVICE_LOG_ERROR("Failed to initialize LTU"); + DEVICE_LOG_ERROR("Failed to initialize LTU\n"); return false; } if (!init_thread_pool()) { - DEVICE_LOG_ERROR("Failed to initialize thread pool"); + DEVICE_LOG_ERROR("Failed to initialize thread pool\n"); return false; } - DEVICE_LOG_DEBUG("NPU device context initialized"); + power_utils = std::make_unique(); + if (power_utils && power_utils->is_valid()) { + power_utils->set_dvcs_performance_mode(true); + DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n"); + } else { + DEVICE_LOG_ERROR("Failed to initialize power utilities\n"); + } + + DEVICE_LOG_DEBUG("NPU device context initialized\n"); return true; } @@ -41,29 +50,29 @@ struct npu_device_context { f16_to_f32_table = std::make_unique(kLtuCount); if (!f16_to_f32_table) { - DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table"); + DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n"); return false; } hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount); - DEVICE_LOG_DEBUG("f16_to_f32 table initialized"); + DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n"); return true; } bool init_thread_pool() { if (thread_pool) { - DEVICE_LOG_DEBUG("Thread pool already initialized"); + DEVICE_LOG_DEBUG("Thread pool already initialized\n"); return true; } auto pool = std::make_unique(); if (!pool) { - DEVICE_LOG_ERROR("Failed to create thread pool"); + DEVICE_LOG_ERROR("Failed to create thread pool\n"); return false; } thread_pool = std::move(pool); - DEVICE_LOG_DEBUG("Thread pool initialized"); + DEVICE_LOG_DEBUG("Thread pool initialized\n"); return true; } }; @@ -102,25 +111,25 @@ int npu_device_open(const char * uri, remote_handle64 * h) { // TODO: should we have a device context here? 
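+    // One npu_device_context is created per FastRPC session: init() builds the
+    // f16 -> f32 lookup table, spawns the worker thread pool and enables the DCVS
+    // performance mode via power_utils; the opaque pointer is returned to the host
+    // as the remote handle.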
auto * context = new npu_device_context(); if (!context->init()) { - DEVICE_LOG_ERROR("Failed to initialize npu_device_context"); + DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n"); delete context; return AEE_EFAILED; } *h = reinterpret_cast(context); - DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h); + DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h); return AEE_SUCCESS; } int npu_device_close(remote_handle64 h) { auto * context = device_context_from_handle(h); if (!context) { - DEVICE_LOG_ERROR("Invalid npu_device_context handle"); + DEVICE_LOG_ERROR("Invalid npu_device_context handle\n"); return AEE_EINVHANDLE; } delete context; - DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h); + DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h); return AEE_SUCCESS; } @@ -139,7 +148,7 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, NPU_UNUSED(_h); if (!srcs || srcsLen <= 0 || !dst || !is_supported) { - DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments"); + DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n"); return AEE_EINVARGS; } @@ -185,7 +194,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h, int tensor_handlesLen) { NPU_UNUSED(_h); if (!tensor_handles || tensor_handlesLen < 0) { - DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments"); + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n"); return AEE_EINVARGS; } @@ -194,7 +203,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h, if (tensor) { delete tensor; } else { - DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i); + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i); } } @@ -250,13 +259,13 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { auto dev_ctx = device_context_from_handle(_h); if (!dev_ctx) { - DEVICE_LOG_DEBUG("Invalid npu_device_context handle"); + DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n"); return AEE_EINVHANDLE; } auto * graph = graph_from_handle(graph_handle); if (!graph) { - DEVICE_LOG_ERROR("Invalid graph handle"); + DEVICE_LOG_ERROR("Invalid graph handle\n"); return AEE_EINVHANDLE; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index c963ef966ea22..19326a523d3ff 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -91,6 +91,7 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread const bool should_sync = requires_thread_barrier(op); if (pool && should_sync && i < _tensor_count - 1) { + // For the last tensor, the thread pool will handle synchronization DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this, params.get_thread_index(), diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 5437fb848e3b0..b128a6b82ff82 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -13,7 +13,7 @@ inline float f16_to_f32(const npu_device_fp16_t src) { } // From: ggml/src/ggml-cpu/ops.cpp -template +template void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, @@ -24,6 +24,7 @@ void flash_attn_impl(hexagon::tensor * out, static_assert(3 <= hexagon::kMaxParamsCount, 
"flash_attn op params count exceeds max params count"); constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32; + constexpr const bool kHasMask = _HasMask; if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) { DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n", @@ -32,6 +33,11 @@ void flash_attn_impl(hexagon::tensor * out, return; } + if (kHasMask != (mask != nullptr)) { + DEVICE_LOG_ERROR("flash_attn_impl: mask is required when kHasMask is true\n"); + return; + } + float scale = out->get_op_param(0); const float max_bias = out->get_op_param(1); const float logit_softcap = out->get_op_param(2); @@ -96,7 +102,7 @@ void flash_attn_impl(hexagon::tensor * out, const uint8_t * q_ptr = q->get_read_buffer(); const uint8_t * k_ptr = k->get_read_buffer(); const uint8_t * v_ptr = v->get_read_buffer(); - const uint8_t * mask_ptr = mask ? mask->get_read_buffer() : nullptr; + const uint8_t * mask_ptr = kHasMask ? mask->get_read_buffer() : nullptr; const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr; float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator @@ -125,11 +131,17 @@ void flash_attn_impl(hexagon::tensor * out, } const npu_device_fp16_t * mp = - mask_ptr ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1) + + kHasMask ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1) + (iq2 % mask->get_ne(2)) * mask->get_nb(2) + (iq3 % mask->get_ne(3)) * mask->get_nb(3)) : nullptr; + q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); + + if (kHasMask) { + hexagon::l2fetch_row(reinterpret_cast(mp), mask->get_nb(1)); + } + // k indices const int ik3 = iq3 / rk3; const int ik2 = iq2 / rk2; @@ -138,8 +150,6 @@ void flash_attn_impl(hexagon::tensor * out, const int iv3 = iq3 / rv3; const int iv2 = iq2 / rv2; - q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); - // online softmax / attention // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf @@ -147,7 +157,7 @@ void flash_attn_impl(hexagon::tensor * out, const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3); for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); - float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; + float mv = kHasMask ? 
(slope * f16_to_f32(mp[ic])) : 0.0f; if (mv == -INFINITY) { continue; } @@ -282,9 +292,17 @@ bool flash_attn_f32(tensor * out, compute_params * params) { const auto * mask = out->get_src(3); const auto * sinks = out->get_src(4); if (k->get_type() == NPU_DATA_TYPE_F16) { - flash_attn_impl(out, q, k, v, mask, sinks, params); + if (mask) { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } else { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } } else { - flash_attn_impl(out, q, k, v, mask, sinks, params); + if (mask) { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } else { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } } return true; } @@ -338,8 +356,8 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec, if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) { DEVICE_LOG_DEBUG( - "[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, " - "v ne: %ld, %ld, %ld, %ld\n", + "[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, " + "v ne: %lld, %lld, %lld, %lld\n", op_get_name(op), dst->ne[0], dst->ne[1], @@ -359,24 +377,25 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec, if (is_transposed_or_permuted(dst->nb)) { DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n", op_get_name(op), - dst->nb[0], - dst->nb[1], - dst->nb[2], - dst->nb[3]); + (size_t) dst->nb[0], + (size_t) dst->nb[1], + (size_t) dst->nb[2], + (size_t) dst->nb[3]); return false; } if (q->ne[0] != k->ne[0]) { - DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n", - op_get_name(op), - q->ne[0], - q->ne[1], - q->ne[2], - q->ne[3], - k->ne[0], - k->ne[1], - k->ne[2], - k->ne[3]); + DEVICE_LOG_DEBUG( + "[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n", + op_get_name(op), + q->ne[0], + q->ne[1], + q->ne[2], + q->ne[3], + k->ne[0], + k->ne[1], + k->ne[2], + k->ne[3]); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_glu.cpp b/ggml/src/ggml-qnn/npu/device/op_glu.cpp new file mode 100644 index 0000000000000..f3e89064fb99c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_glu.cpp @@ -0,0 +1,228 @@ + +#include "op_glu.hpp" + +#include "type_traits.hpp" +#include "util.hpp" + +namespace { + +template struct get_data_type {}; + +template +struct get_data_type { + using type = _TyData; + using param_type = typename std::remove_cv::type>::type; +}; + +inline float dummy_load_coeff() { + // This is a dummy function to satisfy the template requirements. + // In practice, this should be replaced with a proper coefficient loading function. + return 0; +} + +inline float expf_f16_guard_inf(float x) { + // Avoid overflow for large values, f16: log(65504) + constexpr float kMaxExp = 11.0898664f; + + if (x >= kMaxExp) { + // Avoid overflow for large values + return std::numeric_limits::infinity(); + } + + return std::expf(x); +} + +inline void glu_vec_op_f16_f16(const __fp16 * src0, const __fp16 * src1, __fp16 * dst, size_t count, float coeff) { + // TODO: use simd version, for some input hexagon intrinsics will generate nan instead of inf. 
+ for (uint32_t i = 0; i < count; ++i) { + float x = src0[i]; + float g = src1[i]; + + dst[i] = (x / (1.0f + expf_f16_guard_inf(-x))) * g; + } +} + +inline void glu_vec_op_f32_f32(const float * src0, + const float * src1, + float * dst, + size_t count, + hexagon::HVX_VectorPair_x4 coeff) { + using namespace hexagon::vec; + vec_trans_with_param_impl( + src0, src1, dst, count, coeff); +} + +template +bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + using param_type = typename get_data_type::param_type; + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + static_assert(std::is_same_v, + "GluRowFunc must have the same param type as CoeffLoadFunc"); + + if (!out) { + return false; + } + + const bool has_src1 = out->get_src(1) != nullptr; + auto * src0 = out->get_src(0); + auto * src1 = has_src1 ? out->get_src(1) : src0; + if (!src0 || !src1) { + return true; // skip if no src + } + + const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2; + if (out->get_ne(0) != total_cols) { + DEVICE_LOG_ERROR( + "[hexagon-npu][GLU]out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols); + return false; + } + + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = params->get_work_slice(total_rows); + if (start_end.first >= start_end.second) { + return true; + } + + uint8_t * dst_ptr = out->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("[hexagon-npu][GLU]glu_impl: dst_ptr is not writable, tensor: %p, type: %s\n", + (void *) out, + hexagon::get_type_name(out->get_type())); + return false; + } + + const int32_t swapped = out->get_op_param(1); + const uint8_t * src0_ptr = src0->get_read_buffer(); + const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type)); + if (swapped) { + std::swap(src0_ptr, src1_ptr); + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); + + auto coeff = _CoeffLoadFunc(); + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); + for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? 
+ const auto i13 = i03 % src1->get_ne(3); + const auto i12 = i02 % src1->get_ne(2); + const auto i11 = i01 % src1->get_ne(1); + + auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * src1_row = src1_plane + i11 * src1->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + _GluRowFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), + reinterpret_cast(dst_row), + static_cast(total_cols), + coeff); + } + + out->release_write_buffer(); // mark the output tensor as modified + return true; +} + +template +bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) { + using namespace hexagon::vec::math; + + if (out->get_op_param(0) != NPU_GLU_OP_SWIGLU) { + DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", (int) out->get_op_param(0)); + return false; + } + + if (out->get_type() != _DataType) { + DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n", + hexagon::get_type_name(out->get_type()), + hexagon::get_type_name(_DataType)); + return false; + } + + if constexpr (_DataType == NPU_DATA_TYPE_F32) { + return glu_impl(out, params); + } else if constexpr (_DataType == NPU_DATA_TYPE_F16) { + return glu_impl(out, params); + } + + DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type())); + return true; +} + +} // namespace + +namespace hexagon { + +bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params) { + return glu_compute(out, params); +} + +bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params) { + return glu_compute(out, params); +} + +bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, + const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, + size_t src_len) { + const auto op = op_spec->op; + if (op != NPU_OP_GLU) { + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) { + DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), (int) op_spec->params[0]); + return false; + } + + if (!dst || !srcs || src_len < 1) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); + return false; + } + + const auto & src0 = srcs[0]; + if (dst->type != src0.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", + hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), + hexagon::get_type_name(dst->type)); + return false; + } + + if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG( + "[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type)); + return false; + } + + if (src_len > 1) { + if (!hexagon::is_same_shape(src0, *dst) || !hexagon::is_same_shape(srcs[1], *dst)) { + DEVICE_LOG_DEBUG("[%s]src0, src1 and dst have different shape\n", hexagon::op_get_name(op)); + return false; // src0 and src1 have the same shape as dst + } + } else { + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "GLU requires max dims 4"); + if (src0.ne[0] / 2 != dst->ne[0] || src0.ne[1] != dst->ne[1] || src0.ne[2] != dst->ne[2] || + src0.ne[3] != dst->ne[3]) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape: src0.ne[0]: 
%ld, dst.ne[0]: %ld\n", + hexagon::op_get_name(op), + (long) src0.ne[0], + (long) dst->ne[0]); + return false; + } + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_glu.hpp b/ggml/src/ggml-qnn/npu/device/op_glu.hpp new file mode 100644 index 0000000000000..075dce9ad6ca9 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_glu.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "op_types.hpp" + +namespace hexagon { + +bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params); +bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params); + +bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, + const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, + size_t src_len); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 647fdd347ff3e..c423b24778981 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -3,11 +3,13 @@ #include "op_impl.hpp" #include "op_flash_attn.hpp" +#include "op_glu.hpp" #include "op_mul_mat.hpp" #include "op_rope.hpp" #include "type_traits.hpp" #include "vec_ops.hpp" +#include #include namespace { @@ -59,15 +61,10 @@ template struct get_data_type -struct get_data_type { - using type = _TyData; -}; - template struct get_data_type { using type = _TyData; - using param_type = typename std::remove_cv::type>::type; + using param_type = typename std::remove_cv::type>::type; }; template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { @@ -325,171 +322,6 @@ bool is_unary_op_supported(const npu_device_tensor_op_spec * op_spec, return true; } -inline void glu_vec_op_f32_f32(const float * src0, - const float * src1, - float * dst, - size_t count, - hexagon::HVX_VectorPair_x4 coeff) { - using namespace hexagon::vec; - vec_trans_with_param_impl( - src0, src1, dst, count, coeff); -} - -inline void glu_vec_op_f16_f16(const npu_device_fp16_t * src0, - const npu_device_fp16_t * src1, - npu_device_fp16_t * dst, - size_t count, - hexagon::HVX_VectorPair_x4 coeff) { - using namespace hexagon::vec; - vec_trans_with_param_impl( - src0, src1, dst, count, coeff); -} - -template -bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) { - using data_type = typename get_data_type::type; - static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); - - if (!out) { - return false; - } - - const bool has_src1 = out->get_src(1) != nullptr; - auto * src0 = out->get_src(0); - auto * src1 = has_src1 ? out->get_src(1) : src0; - if (!src0 || !src1) { - return true; // skip if no src - } - - const auto total_cols = has_src1 ? 
src0->get_ne(0) : src0->get_ne(0) / 2; - if (out->get_ne(0) != total_cols) { - DEVICE_LOG_ERROR("out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols); - return false; - } - - auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); - const auto start_end = params->get_work_slice(total_rows); - if (start_end.first >= start_end.second) { - return true; - } - - uint8_t * dst_ptr = out->get_write_buffer(); - if (!dst_ptr) { - DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n", - (void *) out, - hexagon::get_type_name(out->get_type())); - return false; - } - - const int32_t swapped = out->get_op_param(1); - const uint8_t * src0_ptr = src0->get_read_buffer(); - const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type)); - if (swapped) { - std::swap(src0_ptr, src1_ptr); - } - - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); - - auto coeff = _CoeffLoadFunc(); - const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); - for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { - const auto i03 = ir / rows_per_cube; - const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); - const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? - const auto i13 = i03 % src1->get_ne(3); - const auto i12 = i02 % src1->get_ne(2); - const auto i11 = i01 % src1->get_ne(1); - - auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); - auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); - auto * src1_row = src1_plane + i11 * src1->get_nb(1); - auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); - if (ir + 1 < start_end.second) { - hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); - hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); - } - - _GluRowFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), - reinterpret_cast(dst_row), - static_cast(total_cols), - coeff); - } - - out->release_write_buffer(); // mark the output tensor as modified - return true; -} - -template -bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) { - using namespace hexagon::vec::math; - - if (out->get_op_param(0) != NPU_GLU_OP_SWIGLU) { - DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", out->get_op_param(0)); - return false; - } - - if (out->get_type() != _DataType) { - DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n", - hexagon::get_type_name(out->get_type()), - hexagon::get_type_name(_DataType)); - return false; - } - - if constexpr (_DataType == NPU_DATA_TYPE_F32) { - return glu_impl(out, params); - } else if constexpr (_DataType == NPU_DATA_TYPE_F16) { - return glu_impl(out, params); - } - - DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type())); - return true; -} - -bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, - const npu_device_tensor_spec * dst, - const npu_device_tensor_spec * srcs, - size_t src_len) { - const auto op = op_spec->op; - if (op != NPU_OP_GLU) { - DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); - return false; - } - - if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) { - DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), op_spec->params[0]); - return false; - } - - if (!dst || !srcs || src_len < 1) { 
- DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); - return false; - } - - const auto & src0 = srcs[0]; - if (dst->type != src0.type) { - DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", - hexagon::op_get_name(op), - hexagon::get_type_name(src0.type), - hexagon::get_type_name(dst->type)); - return false; - } - - if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) { - DEVICE_LOG_DEBUG( - "[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type)); - return false; - } - - if (!hexagon::is_same_shape(src0, *dst)) { - DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); - return false; - } - - return false; // TODO: fix: for some input hexagon intrinsics will generate nan instead of inf. -} - struct op_capabilities { npu_device_tensor_op op; hexagon::op_is_supported_func_type is_supported; @@ -499,60 +331,60 @@ struct op_capabilities { constexpr const op_capabilities kOpCapabilities[] = { { - NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, + NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, { hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, true, // requires_thread_barrier + }, true, // requires_thread_barrier }, { - NPU_OP_ADD, is_element_wise_op_supported, + NPU_OP_ADD, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { NPU_OP_SUB, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_MUL, is_element_wise_op_supported, + NPU_OP_MUL, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_RMS_NORM, is_unary_op_supported, + NPU_OP_RMS_NORM, is_unary_op_supported, { unary_op, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_FLASH_ATTN,hexagon::is_flash_attn_supported, + NPU_OP_FLASH_ATTN, hexagon::is_flash_attn_supported, { hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 }, true, // requires_thread_barrier }, { - NPU_OP_ROPE, hexagon::is_rope_supported, + NPU_OP_ROPE, hexagon::is_rope_supported, { hexagon::rope_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_GLU, is_glu_op_supported, + NPU_OP_GLU, hexagon::is_glu_op_supported, { - glu_compute, // NPU_DATA_TYPE_F32 - glu_compute, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier + hexagon::glu_f32, // NPU_DATA_TYPE_F32 + hexagon::glu_f16, // NPU_DATA_TYPE_F16 + }, true, // TODO: should we avoid using thread barrier? 
}, }; diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 41bf2c7838d6b..852b347bce212 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -36,8 +36,9 @@ void mul_mat_impl(hexagon::tensor * src0, using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); - auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; @@ -62,8 +63,8 @@ void mul_mat_impl(hexagon::tensor * src0, if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || start_end_element.second <= start_end_element.first) { DEVICE_LOG_DEBUG( - "mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), " - "start_end_element: (%ld, %ld)\n", + "mul_mat_impl: no work to do, start_end_plane: (%lld, %lld), start_end_row: (%lld, %lld), " + "start_end_element: (%lld, %lld)\n", start_end_plane.first, start_end_plane.second, start_end_row.first, @@ -116,6 +117,7 @@ void mul_mat_impl(hexagon::tensor * src0, return; } + auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector(); constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0; const uint8_t * src0_ptr = src0->get_read_buffer(); const uint8_t * src1_ptr = src1->get_read_buffer(); @@ -146,7 +148,8 @@ void mul_mat_impl(hexagon::tensor * src0, auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size; dequantize_row_func(src0_row, reinterpret_cast(cached_row_ptr), - src0->get_ne(0)); + src0->get_ne(0), + dequant_table); } last_cached_plane_ptr = src0_plane; @@ -218,8 +221,9 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); - auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; @@ -229,7 +233,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, if (dst->get_ne(0) >= params->get_thread_count()) { start_end_element = params->get_work_slice(dst->get_ne(0)); } else { - DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %ldx%ldx%ldx%ld\n", + DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n", hexagon::get_type_name(src1->get_type()), src1->get_ne(0), src1->get_ne(1), @@ -241,7 +245,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, if (start_end_element.second <= start_end_element.first) { DEVICE_LOG_DEBUG( 
"mul_mat_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), " - "start_end_element: [%ld, %ld)\n", + "start_end_element: [%lld, %lld)\n", start_end_element.first, start_end_element.second); return; @@ -297,6 +301,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, return; } + auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector(); constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0; const uint8_t * src0_ptr = src0->get_read_buffer(); const uint8_t * src1_ptr = src1->get_read_buffer(); @@ -325,8 +330,10 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, } auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size; - dequantize_row_func( - src0_row, reinterpret_cast(cached_row_ptr), src0->get_ne(0)); + dequantize_row_func(src0_row, + reinterpret_cast(cached_row_ptr), + src0->get_ne(0), + dequant_table); } src0_plane = src0_plane_cache_ptr; diff --git a/ggml/src/ggml-qnn/npu/device/op_rope.cpp b/ggml/src/ggml-qnn/npu/device/op_rope.cpp index 27a35394c50c4..d73d13983ac01 100644 --- a/ggml/src/ggml-qnn/npu/device/op_rope.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_rope.cpp @@ -165,7 +165,7 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { } if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) { - DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %d\n", n_dims, out->get_ne(0) / 2); + DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %lld\n", n_dims, out->get_ne(0) / 2); return false; // invalid n_dims for vision ROPE } diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index a6feefe2ecaa6..5d2dc44d5475b 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -20,30 +20,30 @@ class tensor { void * mmap_address = nullptr; auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address); if (ret != AEE_SUCCESS) { - DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d\n", (int) ret); return; } _data = static_cast(mmap_address); - DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n", - (void *) this, - (long) _info.ne[0], - (long) _info.ne[1], - (long) _info.ne[2], - (long) _info.ne[3], - _info.buffer_fd, - _info.offset, - (void *) mmap_address, - phy_address); + DEVICE_LOG_DEBUG("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n", + (void *) this, + (long) _info.ne[0], + (long) _info.ne[1], + (long) _info.ne[2], + (long) _info.ne[3], + (int) _info.buffer_fd, + (size_t) _info.offset, + (void *) mmap_address, + (long) phy_address); } ~tensor() noexcept { auto ret = HAP_mmap_put(_info.buffer_fd); if (ret != AEE_SUCCESS) { - DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d\n", (int) ret); } - DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); + DEVICE_LOG_DEBUG("~tensor(%p) fd: %d\n", (void *) this, _info.buffer_fd); } void flush() const { @@ -131,7 +131,7 @@ class tensor { uint8_t * get_write_buffer() const { if (_info.is_constant) { - DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this); + DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p\n", (void *) this); return nullptr; // Do not allow writing to constant tensors } diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp 
b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 902bdcfc564c7..aeaee16bf9d3b 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -14,7 +14,7 @@ namespace hexagon { constexpr const size_t kMaxThreadCount = 4; -constexpr const size_t kDefaultStackSize = 1024 * 64; // 64KB +constexpr const size_t kDefaultStackSize = NPU_THREAD_STACK_SIZE; // 64KB template class qurt_thread { public: @@ -24,7 +24,7 @@ template class qurt_thread { qurt_thread_func_type thread_func, void * arg, unsigned short priority) { - DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str()); + DEVICE_LOG_DEBUG("qurt_thread.create: %s\n", thread_name.c_str()); qurt_thread_attr_init(&_attributes); qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str()); qurt_thread_attr_set_stack_addr(&_attributes, _stack); @@ -37,26 +37,26 @@ template class qurt_thread { auto ret = qurt_thread_create( &_tid, &_attributes, reinterpret_cast(&qurt_thread::thread_func_impl), (void *) this); if (ret != QURT_EOK) { - DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to create thread: %d\n", (int) ret); _func = nullptr; _arg = nullptr; return; } - DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid); + DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d\n", thread_name.c_str(), (int) _tid); } ~qurt_thread() { - DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid); + DEVICE_LOG_DEBUG("qurt_thread.destroy: %d\n", (int) _tid); int thread_exit_code = QURT_EOK; auto ret = qurt_thread_join(_tid, &thread_exit_code); if (ret != QURT_EOK && ret != QURT_ENOTHREAD) { - DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to join thread: %d\n", (int) ret); return; } if (thread_exit_code != QURT_EOK) { - DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code); + DEVICE_LOG_ERROR("Thread exit code: %d\n", (int) thread_exit_code); } } @@ -135,7 +135,7 @@ template class thread_pool { auto thread = std::make_unique( thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority); if (!thread->is_valid()) { - DEVICE_LOG_ERROR("Failed to create thread: %zu", i); + DEVICE_LOG_ERROR("Failed to create thread: %zu\n", i); // destroy all barriers and threads at destructor return; } @@ -143,11 +143,11 @@ template class thread_pool { _threads[i] = std::move(thread); } - DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount); + DEVICE_LOG_DEBUG("thread_pool.created: %zu\n", kMaxSubThreadCount); } ~thread_pool() { - DEVICE_LOG_DEBUG("thread_pool.destroy"); + DEVICE_LOG_DEBUG("thread_pool.destroy\n"); _thread_exit = true; qurt_barrier_wait(&_pending); // release all task threads @@ -161,7 +161,7 @@ template class thread_pool { bool sync_execute(task_type task, void * arg) { if (!task) { - DEVICE_LOG_ERROR("Invalid task"); + DEVICE_LOG_ERROR("Invalid task\n"); return false; } @@ -174,7 +174,7 @@ template class thread_pool { qurt_barrier_wait(&_pending); task(this, &_thread_params[0], arg); - DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); + DEVICE_LOG_DEBUG("main_thread.task_completed: 0\n"); qurt_barrier_wait(&_completed); @@ -198,19 +198,19 @@ template class thread_pool { auto * param = reinterpret_cast(arg); - DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.start: %zu\n", param->tidx); auto & pool = *(param->pool); for (;;) { qurt_barrier_wait(&pool._pending); if 
(pool._thread_exit) { - DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu\n", param->tidx); break; } #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING auto task_begin_cycles = pool._task_begin_cycles.load(); - DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus", + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus\n", param->tidx, static_cast( HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); @@ -221,18 +221,18 @@ template class thread_pool { task(param->pool, param, pool._arg); } - DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu\n", param->tidx); qurt_barrier_wait(&pool._completed); #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING - DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus", + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus\n", param->tidx, static_cast( HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); #endif } - DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.end: %zu\n", param->tidx); } std::atomic_bool _thread_exit = false; diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 3350167749230..0589aa414cf2b 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -3,8 +3,6 @@ #include "op_types.hpp" // TODO: remove this include #include "vec_ops.hpp" -#include - #include static_assert(sizeof(npu_device_block_q4_k) == @@ -31,42 +29,122 @@ inline npu_device_fp16_t to_fp16(const float src) { template inline HVX_Vector load_into_vector(const _TStruct * src) { static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load"); - const HVX_Vector * qs0 = reinterpret_cast(&(src->*_MemberPtr)); - HVX_Vector prev = *qs0; - HVX_Vector curr = hexagon::is_addr_aligned(qs0) ? 
Q6_V_vzero() : *(qs0 + 1); - return Q6_V_valign_VVR(curr, prev, (size_t) qs0); + return *reinterpret_cast(&(src->*_MemberPtr)); +} + +template inline HVX_Vector load_struct_into_vector(const _TStruct * src) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load"); + + return *reinterpret_cast(src); } template inline HVX_Vector load_block_generic(const _TBlock & src) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong block size/padding"); return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src); } -template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding"); - constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); +template inline HVX_Vector make_scale_load_mask() { + static_assert(sizeof(_TBlock) < 32, "wrong block size/padding"); + static_assert(sizeof(_TBlock::qs) == 16 || sizeof(_TBlock::qs) == 32, "wrong quantization block size"); + + constexpr const size_t kScaleBlockSize = QUANT_BLOCK_SIZE * sizeof(hexagon::dequant_output_type); + + // TODO: handle the case that scale not at the start of struct + hexagon::HVX_VectorAlias ret; + for (size_t i = 0; i < QUANT_BLOCK_SIZE; ++i) { + size_t base = i * 2; + ret.u8[base] = 0; + ret.u8[base + 1] = 1; + + ret.u8[base + kScaleBlockSize] = sizeof(_TBlock); + ret.u8[base + kScaleBlockSize + 1] = sizeof(_TBlock) + 1; + } + + return ret.v; +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs, HVX_VectorPred mask) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs)); + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale); + return Q6_V_vmux_QVV(mask, blocks, block1); } -template inline HVX_Vector load_qual_block_generic(const _TBlock * srcs) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding"); +template +inline hexagon::HVX_Vector_x2 load_dual_block_generic(const _TBlock * srcs, + HVX_VectorPred mask, + const HVX_Vector scale_indices) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; + + hexagon::HVX_Vector_x2 result; + + HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs); + + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale); + HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks); + + result.val[0] = Q6_V_vmux_QVV(mask, blocks, block1); + result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0); + + return result; +} + +template inline hexagon::HVX_VectorPred_x3 make_quad_block_mask() { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - HVX_Vector blocks = load_into_vector<_TBlock, 4, &_TBlock::qs>(srcs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - HVX_VectorPair qp0 = 
Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); + hexagon::HVX_VectorPred_x3 mask; + mask.val[0] = Q6_Q_vsetq_R(kSizeOfQs); + mask.val[1] = Q6_Q_vsetq_R(kSizeOfQs * 3); + mask.val[2] = Q6_Q_vsetq_R(kSizeOfQs * 2); + return mask; +} + +template +inline hexagon::HVX_Vector_x3 load_qual_block_generic(const _TBlock * srcs, + const hexagon::HVX_VectorPred_x3 mask, + const HVX_Vector scale_indices) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; + + hexagon::HVX_Vector_x3 result; + + const HVX_Vector blocks = load_struct_into_vector<_TBlock, 4>(srcs); + + { + HVX_Vector block0 = Q6_V_vror_VR(blocks, kSizeOfScale); + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale * 2); + + HVX_Vector block2 = Q6_V_vror_VR(blocks, kSizeOfScale * 3); + HVX_Vector block3 = Q6_V_vror_VR(blocks, kSizeOfScale * 4); + + HVX_Vector block01 = Q6_V_vmux_QVV(mask.val[0], block0, block1); + HVX_Vector block23 = Q6_V_vmux_QVV(mask.val[1], block2, block3); - HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); - HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); - HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs); + result.val[0] = Q6_V_vmux_QVV(mask.val[2], block01, block23); + } + + { + HVX_Vector scale23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2); + + HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks); + scale23 = Q6_Vb_vshuff_Vb(scale23); + + result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0); + result.val[2] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale23, 0); + } - return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2)); + return result; } inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { + // TODO: use intrinsics if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -324,23 +402,24 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count) { } } -void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access + const HVX_VectorPred mask = Q6_Q_vsetq_R(sizeof(npu_device_block_q8_0::qs)); + const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2); int i = 0; for (; i + 1 < nb; i += 2) { const auto & src0 = src_ptr[i]; const auto & src1 = src_ptr[i + 1]; - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + HVX_Vector scales01 = Q6_V_vmux_QVV(scale_mask, Q6_Vh_vsplat_R(src0.d), Q6_Vh_vsplat_R(src1.d)); - HVX_Vector qs = load_dual_block_generic(src_ptr + i); + HVX_Vector qs = load_dual_block_generic(src_ptr + i, mask); HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs))); HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); @@ -363,44 +442,39 @@ void dequantize_row_q8_0(const void * src, 
hexagon::dequant_output_type * dst, s } template -void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(qk % 2 == 0, "qk must be even"); static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); + static const auto load_masks = make_quad_block_mask(); + static const HVX_Vector scale_indices __attribute__((aligned(hexagon::kBytesPerVector))) = + make_scale_load_mask(); + const int nb = count / qk; const auto * src_ptr = reinterpret_cast(src); - const HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector minus = Q6_Vb_vsplat_R(8); hexagon::dequant_output_type * dst_ptr = dst; // TODO: opt for aligned access int i = 0; for (; i + 3 < nb; i += 4) { - const auto & src0 = src_ptr[i]; - const auto & src1 = src_ptr[i + 1]; - const auto & src2 = src_ptr[i + 2]; - const auto & src3 = src_ptr[i + 3]; + auto qs = load_qual_block_generic(src_ptr + i, load_masks, scale_indices); - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); - HVX_Vector scales23 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src3.d), Q6_Vh_vsplat_R(src2.d), hexagon::kBytesPerVector / 2); + HVX_Vector q_lo = qs.val[0]; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4); - HVX_Vector qs = load_qual_block_generic(src_ptr + i); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + q_lo = Q6_V_lo_W(qp0); + q_hi = Q6_V_hi_W(qp0); - q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, qs.val[1]); + q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, qs.val[2]); + + q_lo = Q6_Vhf_equals_Vqf16(q_lo); q_hi = Q6_Vhf_equals_Vqf16(q_hi); if constexpr (_IsDstAligned) { @@ -415,21 +489,16 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d } for (; i + 1 < nb; i += 2) { - const auto & src0 = src_ptr[i]; - const auto & src1 = src_ptr[i + 1]; + auto qs = load_dual_block_generic(src_ptr + i, load_masks.val[0], scale_indices); + HVX_Vector q_lo = qs.val[0]; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2)); - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); - HVX_Vector qs = load_dual_block_generic(src_ptr + i); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2)); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + q_lo = 
Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), qs.val[1]); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); if constexpr (_IsDstAligned) { *reinterpret_cast(dst_ptr) = q_lo; @@ -445,14 +514,15 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d); HVX_Vector qs = load_block_generic(curr_blk); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); + HVX_Vector q_lo = qs; HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); + + q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scales); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); if constexpr (_IsDstAligned) { hexagon::q6op_vstu_variable_aligned(dst_ptr, q_lo); @@ -462,24 +532,82 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d } } -void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count) { +HVX_Vector load_dequant_table_q4_0() { + constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values + constexpr const int kQ4ZeroPoint = 8; // zero point for q4_0 quantization + static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large"); + + static const HVX_Vector result = []() -> HVX_Vector { + union { + HVX_Vector v; + __fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)]; + } table __attribute__((aligned(hexagon::kBytesPerVector))); + + table.v = Q6_V_vzero(); + for (int i = 0; i < kTableSize; ++i) { + table.f16[i * 2] = i - kQ4ZeroPoint; // TODO: vectorize this? + } + return table.v; + }(); + + return result; +} + +void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { const bool dst_aligned = hexagon::is_addr_aligned(dst); if (dst_aligned) { - dequantize_row_q4_0_impl(src, dst, count); + dequantize_row_q4_0_impl(src, dst, count, table); } else { - dequantize_row_q4_0_impl(src, dst, count); + dequantize_row_q4_0_impl(src, dst, count, table); } } -void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count) { +HVX_Vector load_dequant_table_q4_k() { + constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values + static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large"); + + const static HVX_Vector result = []() -> HVX_Vector { + union { + HVX_Vector v; + __fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)]; + } table __attribute__((aligned(hexagon::kBytesPerVector))); + + table.v = Q6_V_vzero(); + for (int i = 0; i < kTableSize; ++i) { + table.f16[i * 2] = i; // TODO: vectorize this? 
+ } + return table.v; + }(); + + return result; +} + +void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { + constexpr const int kQuantSubBlockSize = 32; + const int nb = count / QUANT_K_BLOCK_SIZE; const auto * src_ptr = reinterpret_cast(src); - auto * dst_ptr = reinterpret_cast<__fp16 *>(dst); + auto * dst_ptr = reinterpret_cast(dst); + + const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2); + + union { + HVX_VectorPair p[2]; + HVX_Vector v[4]; + } dual_pair __attribute__((aligned(hexagon::kBytesPerVector * 4))); - // TODO: use intrinsics for (int i = 0; i < nb; i++) { const uint8_t * q = src_ptr[i].qs; + HVX_Vector qv = *reinterpret_cast(q); + + HVX_Vector q_lo = qv; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qv, 4); + HVX_VectorPair qp = Q6_W_vshuff_VVR(q_hi, q_lo, kQuantSubBlockSize * 3); + + dual_pair.p[0] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp)), table, 0); + dual_pair.p[1] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_hi_W(qp)), table, 0); + const __fp16 d = reinterpret_cast(src_ptr[i].d); const __fp16 min = reinterpret_cast(src_ptr[i].dmin); @@ -487,30 +615,61 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, s uint8_t sc = 0; uint8_t m = 0; const auto * scales = src_ptr[i].scales; - for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 128) { get_scale_min_k4(is + 0, scales, &sc, &m); + const __fp16 d0 = d * sc; + const __fp16 m0 = min * m; + + HVX_Vector dv0 = Q6_Vh_vsplat_R(reinterpret_cast(d0)); + HVX_Vector dm0 = Q6_Vh_vsplat_R(reinterpret_cast(m0)); + + get_scale_min_k4(is + 1, scales, &sc, &m); const __fp16 d1 = d * sc; const __fp16 m1 = min * m; - get_scale_min_k4(is + 1, scales, &sc, &m); + + HVX_Vector dv1 = Q6_Vh_vsplat_R(reinterpret_cast(d1)); + HVX_Vector dm1 = Q6_Vh_vsplat_R(reinterpret_cast(m1)); + + get_scale_min_k4(is + 2, scales, &sc, &m); const __fp16 d2 = d * sc; const __fp16 m2 = min * m; - for (int l = 0; l < 32; ++l) { - dst_ptr[0] = d1 * (q[l] & 0xF) - m1; - dst_ptr[32] = d2 * ((q[l] >> 4) & 0xF) - m2; - dst_ptr++; - } - dst_ptr += 32; - q += 32; - is += 2; + + HVX_Vector dv2 = Q6_Vh_vsplat_R(reinterpret_cast(d2)); + HVX_Vector dm2 = Q6_Vh_vsplat_R(reinterpret_cast(m2)); + + get_scale_min_k4(is + 3, scales, &sc, &m); + const __fp16 d3 = d * sc; + const __fp16 m3 = min * m; + + HVX_Vector dv3 = Q6_Vh_vsplat_R(reinterpret_cast(d3)); + HVX_Vector dm3 = Q6_Vh_vsplat_R(reinterpret_cast(m3)); + + HVX_Vector dv01 = Q6_V_vmux_QVV(scale_mask, dv0, dv1); + HVX_Vector dm01 = Q6_V_vmux_QVV(scale_mask, dm0, dm1); + + HVX_Vector dv23 = Q6_V_vmux_QVV(scale_mask, dv2, dv3); + HVX_Vector dm23 = Q6_V_vmux_QVV(scale_mask, dm2, dm3); + + q_lo = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64], dv01); + q_lo = Q6_Vqf16_vsub_Vqf16Vhf(q_lo, dm01); + + q_hi = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64 + 1], dv23); + q_hi = Q6_Vqf16_vsub_Vqf16Vhf(q_hi, dm23); + + reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); + reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + + dst_ptr += 128; + is += 4; } } } -void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) { hexagon::vec_cpy_f16(reinterpret_cast(src), dst, count); } -void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, 
size_t count, HVX_Vector) { hexagon::vec_cpy_f32(reinterpret_cast(src), reinterpret_cast(dst), count); } @@ -539,12 +698,16 @@ constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0, - quantize_row_q4_0 }, + quantize_row_q4_0, nullptr, + nullptr, nullptr, + load_dequant_table_q4_0 }, { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, sizeof(npu_device_block_q4_k), true, dequantize_row_q4_K, - quantize_row_q4_K }, + quantize_row_q4_K, nullptr, + nullptr, nullptr, + load_dequant_table_q4_k }, }; static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp index 363827de0af3d..cfa844aba5e09 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -3,6 +3,8 @@ #include "tensor.hpp" #include "util.hpp" +#include + namespace hexagon { using dequant_output_type = npu_device_fp16_t; @@ -10,9 +12,10 @@ using dequant_output_type = npu_device_fp16_t; bool init_f16_f32_table(float * table, size_t count); typedef void (*quantize_row_type)(const float * src, void * dst, size_t count); -typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count); +typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count, HVX_Vector table); typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count); typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count); +typedef HVX_Vector (*load_dequant_table_type)(); struct device_type_traits { npu_device_tensor_data_type type; @@ -21,11 +24,12 @@ struct device_type_traits { size_t type_size; bool is_quantized; - dequantize_row_type to_float; - quantize_row_type from_float; - vec_dot_type vec_dot; - vec_dot_type vec_dot_aligned; - can_use_aligned_vec_dot_type can_use_aligned_vec_dot; + dequantize_row_type to_float = nullptr; + quantize_row_type from_float = nullptr; + vec_dot_type vec_dot = nullptr; + vec_dot_type vec_dot_aligned = nullptr; + can_use_aligned_vec_dot_type can_use_aligned_vec_dot = nullptr; + load_dequant_table_type load_dequant_table = nullptr; }; const device_type_traits & get_type_traits(npu_device_tensor_data_type type); @@ -49,7 +53,7 @@ namespace hexagon { inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { auto * src0 = op->get_src(0); auto * src1 = op->get_src(1); - char buffer[1024]; + char buffer[512]; if (src1 == nullptr) { snprintf(buffer, sizeof(buffer), @@ -96,8 +100,10 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \ auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx) -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ - hexagon::npu_sub_process_scoped_timer \ +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ + hexagon::npu_sub_process_scoped_timer< \ + std::remove_reference_t::kBufferCount, \ + idx> \ __npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix) #else diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index d70d7401805d4..71b735ff5ac70 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -8,17 +8,16 @@ 
#include #include +#include #include #include -#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__) -#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__) -#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__) +#define DEVICE_LOG_ERROR(...) hexagon::log_error(__VA_ARGS__) +#define DEVICE_LOG_WARN(...) hexagon::log_message(__VA_ARGS__) +#define DEVICE_LOG_INFO(...) hexagon::log_message(__VA_ARGS__) #ifdef _DEBUG -# undef FARF_LOW -# define FARF_LOW 1 -# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__) +# define DEVICE_LOG_DEBUG(...) hexagon::log_message(__VA_ARGS__) #else # define DEVICE_LOG_DEBUG(...) (void) 0 #endif @@ -40,6 +39,20 @@ namespace hexagon { +__attribute__((format(printf, 1, 2))) inline void log_error(const char * format, ...) { + va_list args; + va_start(args, format); + std::vfprintf(stderr, format, args); + va_end(args); +} + +__attribute__((format(printf, 1, 2))) inline void log_message(const char * format, ...) { + va_list args; + va_start(args, format); + std::vprintf(format, args); + va_end(args); +} + inline constexpr const char * op_get_name(npu_device_tensor_op op) { switch (op) { case NPU_OP_MUL_MAT: @@ -137,23 +150,22 @@ class power_utils { return; } - HAP_power_request_t request = {}; - request.type = HAP_power_set_DCVS_v3; - request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + HAP_power_request_t request = {}; + request.type = HAP_power_set_DCVS_v3; + request.dcvs_v3.set_dcvs_enable = enable ? TRUE : FALSE; + request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + request.dcvs_v3.set_core_params = TRUE; if (enable) { - request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; - /* - * sleep_latency : To request for sleep latency in micro-seconds. - * Sleep latency is the minimum time before which the DSP sleeps - * Set latency to 65535 to reset it to the default value - */ - request.dcvs_v3.set_latency = TRUE; - request.dcvs_v3.latency = 1000; - - request.dcvs_v3.set_bus_params = TRUE; - request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; - request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO; - request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM; + request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v3.set_bus_params = TRUE; + request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.set_sleep_disable = TRUE; + request.dcvs_v3.sleep_disable = TRUE; } auto ret = HAP_power_set(_context_ptr, &request); @@ -359,7 +371,7 @@ template class npu_sub_process_scoped_ti inline auto make_scoped_perf_timer(const char * format, ...) 
{ va_list args; va_start(args, format); - char buffer[1024]; + char buffer[512]; vsnprintf(buffer, sizeof(buffer), format, args); va_end(args); return npu_scoped_timer<1024>(buffer); diff --git a/ggml/src/ggml-qnn/npu/device/vec_math.inl b/ggml/src/ggml-qnn/npu/device/vec_math.inl index ab7f01cf1b3d3..77782734cdce1 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_math.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_math.inl @@ -1120,10 +1120,75 @@ inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) { inline HVX_Vector_x2 hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one); - return { - Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)), - Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)), - }; + + HVX_Vector_x2 ret; + ret.val[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)); + ret.val[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)); + return ret; +} + +/** + * @brief Calculates exponential (e^x) for vector elements with infinity guard + * + * This function computes the exponential value for each element in the input vector. + * For input values greater than kMaxExp (88.02f), the function returns the provided + * infinity value instead of attempting to calculate an exponential that would overflow. + * + * @param sline The input vector containing values to compute exponential for + * @param inf The vector containing the infinity representation to use for guarded values + * @return HVX_Vector containing exponential values, with values > kMaxExp replaced by inf + * + * @note Input values greater than 88.02f will return the specified infinity value + */ +inline HVX_Vector qhmath_hvx_exp_vf_guard_inf(HVX_Vector sline, const HVX_Vector inf) { + constexpr float kMaxExp = 88.02f; + const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast(kMaxExp)); + + HVX_VectorPred pred_gt_max_exp = Q6_Q_vcmp_gt_VsfVsf(sline, max_exp); + + HVX_Vector out = qhmath_hvx_exp_vf(sline); + + out = Q6_V_vmux_QVV(pred_gt_max_exp, inf, out); + return out; +} + +/** + * @brief Vectorized division with guard for infinite denominators on HVX. + * + * Performs element-wise division num/denom using qhmath_hvx_div_vf and then + * masks out lanes where denom equals the provided inf value, forcing those + * lanes of the result to zero. This is a temporary guard until proper INF + * handling is implemented in the underlying division routine. + * + * @param num Numerator vector (per-lane). + * @param denom Denominator vector (per-lane); lanes equal to inf are zeroed in the output. + * @param coeffs Coefficients used by qhmath_hvx_div_vf for the reciprocal/division approximation. + * @param inf Lane value representing +INF to compare against denom. + * @return Vector of num/denom with lanes set to zero where denom == inf. + * + * @note NaNs, negative infinity, zero denominators, and subnormals are not explicitly handled. 
+ * @see qhmath_hvx_div_vf + */ +inline HVX_Vector qhmath_hvx_div_vf_guard_inf(HVX_Vector num, + HVX_Vector denom, + HVX_VectorPair_x4 coeffs, + const HVX_Vector inf) { + HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(denom, inf); + + // TODO: fix the inf in div + HVX_Vector out = qhmath_hvx_div_vf(num, denom, coeffs); + + out = Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out); + return out; +} + +inline HVX_Vector Q6_Vsf_vadd_VsfVsf_guard_inf(HVX_Vector num0, HVX_Vector num1, const HVX_Vector inf) { + HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(num0, inf); + + HVX_Vector out = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(num0, num1)); + + out = Q6_V_vmux_QVV(pred0, inf, out); + return out; } } // namespace hexagon::vec::math diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 92cb8ed9993b8..e286aebbb569b 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -8,12 +8,18 @@ namespace hexagon { +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; + template struct HEXAGON_pack { T val[N]; }; -using HVX_Vector_x2 = std::pair; +using HVX_Vector_x2 = HEXAGON_pack; +using HVX_Vector_x3 = HEXAGON_pack; +using HVX_Vector_x4 = HEXAGON_pack; using HVX_VectorPair_x4 = HEXAGON_pack; +using HVX_VectorPred_x3 = HEXAGON_pack; typedef union { HVX_VectorPair VV; @@ -24,8 +30,14 @@ typedef union { } V; } HVX_DV; -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kAlignMask = kBytesPerVector - 1; +typedef union { + HVX_Vector v; + float f32[kBytesPerVector / sizeof(float)]; + uint32_t u32[kBytesPerVector / sizeof(uint32_t)]; + __fp16 f16[kBytesPerVector / sizeof(__fp16)]; + uint16_t u16[kBytesPerVector / sizeof(uint16_t)]; + uint8_t u8[kBytesPerVector]; +} HVX_VectorAlias; inline size_t get_aligned_size(size_t size) { return (size + kAlignMask) & ~kAlignMask; @@ -383,22 +395,35 @@ _TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count) inline HVX_Vector vec_silu_f32_f32(HVX_Vector x, HVX_VectorPair_x4 coeff) { using namespace hexagon::vec::math; - HVX_Vector one = Q6_V_vsplat_R(0x3F800000); + constexpr float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast(kMaxExp)); + HVX_Vector one = Q6_V_vsplat_R(0x3F800000); // x/(1.0f + expf(-x)); - HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x)); - HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one)); - return qhmath_hvx_div_vf(x, denom, coeff); + HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x)); + HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(exp_neg_x, max_exp); + HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one)); + HVX_Vector out = qhmath_hvx_div_vf(x, denom, coeff); + out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + return out; } inline HVX_Vector vec_silu_f16_f16(HVX_Vector x, HVX_VectorPair_x4 coeff) { using namespace hexagon::vec::math; - HVX_Vector one = Q6_Vh_vsplat_R(0x3c00); + + constexpr __fp16 kMaxExp = 11.0898664f; // log(INF) + + const HVX_Vector max_exp = Q6_Vh_vsplat_R(reinterpret_cast(kMaxExp)); + HVX_Vector one = Q6_Vh_vsplat_R(0x3c00); // x/(1.0f + expf(-x)); - HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x)); - HVX_Vector denom = 
Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one)); - return qhmath_hvx_div_vhf(x, denom, coeff); + HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x)); + HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VhfVhf(exp_neg_x, max_exp); + HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one)); + HVX_Vector out = qhmath_hvx_div_vhf(x, denom, coeff); + out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + return out; } inline HVX_Vector vec_swiglu_f32_f32(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) { diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 854d975edb94c..f2bb174499ee4 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -16,16 +16,18 @@ template 1) { - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; do { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; @@ -33,14 +35,19 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + + HVX_Vector mpy0 = _MpyFunc(l0, l1); + HVX_Vector mpy1 = _MpyFunc(h0, h1); prev0 = Q6_V_hi_W(curr0); prev1 = Q6_V_hi_W(curr1); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + src0_vec_ptr += 2; src1_vec_ptr += 2; } while (src0_vec_ptr_end - src0_vec_ptr > 1); @@ -73,10 +80,11 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size src1_vec_ptr += should_fetch_src1 ? 
1 : 0; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; - sum = _AddFunc(_MpyFunc(s0, s1), sum); + HVX_Vector mpy0 = _MpyFunc(s0, s1); + prev0 = curr0; + prev1 = curr1; + sum = _AddFunc(mpy0, sum); } if (leftover > 0) { @@ -92,7 +100,7 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes), sum); } return _ReduceFunc(sum); @@ -106,36 +114,38 @@ template 3) { - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); - - do { - HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - - HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); - - src0_vec_ptr += 4; - src1_vec_ptr += 4; - } while (src0_vec_ptr_end - src0_vec_ptr > 3); - - sum0 = _AddFunc(sum2, sum0); - sum1 = _AddFunc(sum3, sum1); - } + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; + while (src0_vec_ptr_end - src0_vec_ptr > 3) { + HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + + HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)); + HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)); + + HVX_Vector mpy2 = _MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)); + HVX_Vector mpy3 = _MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + + sum0 = _AddFunc(mpy2, sum0); + sum1 = _AddFunc(mpy3, sum1); + + src0_vec_ptr += 4; + src1_vec_ptr += 4; + }; if (src0_vec_ptr_end - src0_vec_ptr > 1) { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; @@ -143,8 +153,11 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr src0_vec_ptr += 2; src1_vec_ptr += 2; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); + HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)); + HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); } sum = _AddFunc(sum0, sum1); @@ -195,6 +208,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr constexpr const __fp16 kOne = 1.0f; const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + const HVX_Vector kZeroV = Q6_V_vzero(); const _TElem0 * const src0_ptr_end = src0 + count; HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); @@ -202,27 +216,33 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; HVX_Vector prev0 = *src0_vec_ptr++; HVX_Vector prev1 = *src1_vec_ptr++; - HVX_Vector sum = 
Q6_V_vzero(); + HVX_Vector sum = kZeroV; if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; do { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + HVX_Vector curr0 = src0_vec_ptr[0]; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - sum0 = _AddFunc(_MpyFunc(s0_pair.first, l1), sum0); + HVX_Vector curr10 = src1_vec_ptr[0]; + HVX_Vector curr11 = src1_vec_ptr[1]; - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - sum1 = _AddFunc(_MpyFunc(s0_pair.second, h1), sum1); + HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, (size_t) src1); + HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, (size_t) src1); + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], l1); + HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], h1); prev0 = curr0; - prev1 = Q6_V_hi_W(curr1); + prev1 = curr11; + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + src0_vec_ptr++; src1_vec_ptr += 2; } while (src1_vec_ptr_end - src1_vec_ptr > 1); @@ -245,8 +265,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr if (has_remaining_src1_vector) { HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = _AddFunc(_MpyFunc(s0_pair.first, s1), sum); - prev1 = curr1; + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], s1); + prev1 = curr1; + + sum = _AddFunc(mpy0, sum); } bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); @@ -254,9 +277,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr src1_vec_ptr += should_fetch_src1 ? 1 : 0; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); prev0 = curr0; - prev1 = curr1; - sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? s0_pair.second : s0_pair.first, s1), sum); + HVX_Vector mpy1 = _MpyFunc(has_remaining_src1_vector ? s0_pair.val[1] : s0_pair.val[0], s1); + prev1 = curr1; + + sum = _AddFunc(mpy1, sum); } if (leftover1 > 0) { @@ -274,8 +299,8 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector_x2 curr0_pair = _ExpandFunc(curr0, kOneV); - curr0 = leftover1 == leftover0 ? curr0_pair.first : curr0_pair.second; - sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum); + curr0 = leftover1 == leftover0 ? 
curr0_pair.val[0] : curr0_pair.val[1]; + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes1), sum); } return _ReduceFunc(sum); @@ -299,44 +324,55 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem constexpr const __fp16 kOne = 1.0f; const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + const HVX_Vector kZeroV = Q6_V_vzero(); HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; - if (src1_vec_ptr_end - src1_vec_ptr > 3) { - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); + while (src1_vec_ptr_end - src1_vec_ptr > 3) { + HVX_Vector curr0_lo = src0_vec_ptr[0]; + HVX_Vector curr10_lo = src1_vec_ptr[0]; - do { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_Vector_x2 curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(curr00.first, Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(curr00.second, Q6_V_hi_W(curr10)), sum1); + HVX_Vector curr0_hi = src0_vec_ptr[1]; + HVX_Vector_x2 curr00 = _ExpandFunc(curr0_lo, kOneV); - HVX_Vector_x2 curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(curr01.first, Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(curr01.second, Q6_V_hi_W(curr11)), sum3); + HVX_Vector curr10_hi = src1_vec_ptr[1]; + HVX_Vector_x2 curr01 = _ExpandFunc(curr0_hi, kOneV); - src0_vec_ptr += 2; - src1_vec_ptr += 4; - } while (src1_vec_ptr_end - src1_vec_ptr > 3); + HVX_Vector mpy0 = _MpyFunc(curr00.val[0], curr10_lo); + HVX_Vector mpy1 = _MpyFunc(curr00.val[1], curr10_hi); - sum0 = _AddFunc(sum0, sum2); - sum1 = _AddFunc(sum1, sum3); - } + HVX_Vector curr11_lo = src1_vec_ptr[2]; + HVX_Vector curr11_hi = src1_vec_ptr[3]; + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + + HVX_Vector mpy2 = _MpyFunc(curr01.val[0], curr11_lo); + HVX_Vector mpy3 = _MpyFunc(curr01.val[1], curr11_hi); + + sum0 = _AddFunc(mpy2, sum0); + sum1 = _AddFunc(mpy3, sum1); + + src0_vec_ptr += 2; + src1_vec_ptr += 4; + }; if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_Vector curr1_lo = src1_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1); + HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector curr1_hi = src1_vec_ptr[1]; + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], curr1_lo); + HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], curr1_hi); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); } return _ReduceFunc(_AddFunc(sum0, sum1)); @@ -360,14 +396,14 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; src_vec_ptr += 2; - HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); - dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); + HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); + HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), 
Q6_V_lo_W(curr), (size_t) src); + prev = Q6_V_hi_W(curr); - HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); + dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); dst_vec_ptr += 2; - prev = Q6_V_hi_W(curr); } if (src_vec_end - src_vec_ptr > 0) { @@ -405,14 +441,16 @@ template inline void vec_zero_impl(_TData * src, size_t count) HVX_UVector * src_vec_ptr = ((HVX_UVector *) src); HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector); + const HVX_Vector kZeroV = Q6_V_vzero(); + while (src_vec_end - src_vec_ptr > 1) { - src_vec_ptr[0] = Q6_V_vzero(); - src_vec_ptr[1] = Q6_V_vzero(); + src_vec_ptr[0] = kZeroV; + src_vec_ptr[1] = kZeroV; src_vec_ptr += 2; } if (src_vec_end - src_vec_ptr > 0) { - src_vec_ptr[0] = Q6_V_vzero(); + src_vec_ptr[0] = kZeroV; src_vec_ptr++; } @@ -420,7 +458,7 @@ template inline void vec_zero_impl(_TData * src, size_t count) if (leftover > 0) { // handle the leftover elements const size_t leftover_bytes = leftover * sizeof(_TData); - q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, Q6_V_vzero()); + q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, kZeroV); } } diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 526191173dd17..7e8f5db7dd85e 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -90,13 +90,16 @@ bool host_graph::compute() { return false; } + LOG_DEBUG("[%p]host_graph::compute started\n", (void *) this); SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle); auto status = npu_device_graph_compute(_device_handle, _graph_handle); if (status != AEE_SUCCESS) { LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); + LOG_DEBUG("[%p]host_graph::compute finished with failure\n", (void *) this); return false; } + LOG_DEBUG("[%p]host_graph::compute finished\n", (void *) this); return true; } diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index fca1167282765..5e9f51887961b 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -242,6 +242,7 @@ bool npu_device::init_rpc_mem() { bool npu_device::init_device_lib() { if (!_device_handle) { + set_fast_rpc_stack_size(_rpc_interface, _dsp_domain_id, NPU_THREAD_STACK_SIZE); auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); const auto & device_lib_info = get_device_library_info(arch); std::string device_lib_uri = device_lib_info.device_lib_uri; diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index f70526bf25dff..d2d07de897a95 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -1,14 +1,14 @@ #pragma once -#include -#include -#include - #include "common.hpp" #include "ggml-impl.h" #include "hexagon_npu.h" #include "util.hpp" +#include +#include +#include + namespace hexagon { // TODO: merge this with device tensor? 
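
Note on the type_traits changes above: device_type_traits gains a load_dequant_table hook and every to_float callback now takes an extra HVX_Vector table argument. A minimal caller-side sketch of how the two are meant to fit together is below. It is illustrative only: the helper name dequantize_rows and its parameters are assumptions, not code from this patch, and it assumes the HVX intrinsic headers already pulled in by type_traits.hpp.

#include "type_traits.hpp"   // hexagon::get_type_traits, hexagon::dequant_output_type

// Hypothetical helper: dequantize `row_count` rows of a quantized tensor, loading
// the per-type lookup table once and reusing it for every row.
inline void dequantize_rows(npu_device_tensor_data_type    type,
                            const uint8_t *                src,
                            size_t                         src_row_bytes,
                            hexagon::dequant_output_type * dst,
                            size_t                         row_elems,
                            size_t                         row_count) {
    const auto & traits = hexagon::get_type_traits(type);
    if (!traits.to_float) {
        return;  // no dequantizer registered for this type
    }

    // Types without a table (e.g. the plain F16/F32 copy paths) leave
    // load_dequant_table as nullptr and simply receive a zero vector.
    const HVX_Vector table = traits.load_dequant_table ? traits.load_dequant_table() : Q6_V_vzero();

    for (size_t r = 0; r < row_count; ++r) {
        traits.to_float(src + r * src_row_bytes, dst + r * row_elems, row_elems, table);
    }
}
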
@@ -62,7 +62,7 @@ class host_tensor { ~host_tensor() { LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); - if (_device_tensor_handle) { + if (_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) { npu_device_tensor_free(_device_handle, _device_tensor_handle); // TODO: figure out why the _ggml_tensor is invalid here } @@ -113,8 +113,11 @@ class host_tensor { if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) { params_changed = true; memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); - LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this, - (int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2], + LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", + (void *) this, + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); } @@ -136,19 +139,29 @@ class host_tensor { if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { params_changed = true; memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)); - LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this, - (void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]); + LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", + (void *) this, + (void *) _info_update.src_handles[0], + (void *) _info_update.src_handles[1]); } if (params_changed) { npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update); - LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); } else { - LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); } } @@ -174,9 +187,13 @@ class host_tensor { #endif } - LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); return _info_update; } @@ -192,11 +209,21 @@ class host_tensor { } int get_desc(char * buffer, size_t size) const { - return snprintf(buffer, size, "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p", - 
_ggml_tensor->name, (long) _ggml_tensor->ne[0], (long) _ggml_tensor->ne[1], - (long) _ggml_tensor->ne[2], (long) _ggml_tensor->ne[3], (long) _ggml_tensor->nb[0], - (long) _ggml_tensor->nb[1], (long) _ggml_tensor->nb[2], (long) _ggml_tensor->nb[3], - ggml_type_name(_ggml_tensor->type), (void *) this, (void *) _ggml_tensor, + return snprintf(buffer, + size, + "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p", + _ggml_tensor->name, + (long) _ggml_tensor->ne[0], + (long) _ggml_tensor->ne[1], + (long) _ggml_tensor->ne[2], + (long) _ggml_tensor->ne[3], + (long) _ggml_tensor->nb[0], + (long) _ggml_tensor->nb[1], + (long) _ggml_tensor->nb[2], + (long) _ggml_tensor->nb[3], + ggml_type_name(_ggml_tensor->type), + (void *) this, + (void *) _ggml_tensor, (void *) _device_tensor_handle); } diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 13a21c1f9efe1..28aaf34cb7ee2 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -149,6 +149,23 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_ } } +void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size) { + constexpr const uint32_t FASTRPC_THREAD_PARAMS = 1; + + if (!rpc_interface || !rpc_interface->is_valid()) { + return; + } + + remote_rpc_thread_params tp = {}; + tp.domain = domain_id; + tp.prio = -1; + tp.stack_size = stack_size; + auto ret = rpc_interface->remote_session_control(FASTRPC_THREAD_PARAMS, &tp, sizeof(tp)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to set fast RPC stack size: 0x%x\n", ret); + } +} + void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { if (dst == nullptr) { snprintf(out, max_len, "null"); @@ -161,15 +178,30 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { switch (dims) { default: case 4: - snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], - (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]); + snprintf(out, + max_len, + "%s[%ldx%ldx%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], + (long) tensor->ne[1], + (long) tensor->ne[2], + (long) tensor->ne[3]); break; case 3: - snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], - (long) tensor->ne[1], (long) tensor->ne[2]); + snprintf(out, + max_len, + "%s[%ldx%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], + (long) tensor->ne[1], + (long) tensor->ne[2]); break; case 2: - snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + snprintf(out, + max_len, + "%s[%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], (long) tensor->ne[1]); break; case 1: @@ -201,8 +233,14 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); char src3_desc[256]; print_tensor(dst->src[3], src3_desc, sizeof(src3_desc)); - snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc, - src1_desc, src2_desc, src3_desc); + snprintf(out, + max_len, + "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", + dst_desc, + src0_desc, + src1_desc, + src2_desc, + src3_desc); return; } case 3: @@ -213,8 +251,8 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { print_tensor(dst->src[1], src1_desc, sizeof(src1_desc)); char src2_desc[256]; 
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); - snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, - src2_desc); + snprintf( + out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, src2_desc); return; } case 2: diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index b4c2355cac298..44e482679b3ad 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -23,6 +23,7 @@ hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t const char * get_dsp_arch_desc(hexagon_dsp_arch arch); void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); +void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size); void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len); diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index 5aab3524c6043..bc7de725abd3f 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -13,6 +13,8 @@ const uint32_t NPU_ROPE_TYPE_NEOX = 2; const uint32_t NPU_ROPE_TYPE_MROPE = 8; const uint32_t NPU_ROPE_TYPE_VISION = 24; +const uint32_t NPU_THREAD_STACK_SIZE = 64 * 1024; + interface npu_device : remote_handle64{ typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS];
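
For readers following the new INF guards in vec_silu_f32_f32 / vec_silu_f16_f16 and the *_guard_inf helpers, a scalar model of what a guarded lane computes may help. This is a sketch, not code from the patch; silu_ref_f32 is a made-up name, and it assumes the same f32 threshold the patch uses (88.02f, a conservative bound below ln(FLT_MAX) ≈ 88.72; the f16 path uses 11.0898664f ≈ ln(65504), the largest finite half).

#include <cmath>

// Scalar model of the guarded SiLU lane: x / (1 + exp(-x)), with the overflow
// case short-circuited the same way the vector code's mux-to-zero does.
static inline float silu_ref_f32(float x) {
    constexpr float kMaxExp = 88.02f;  // above this, expf() would overflow to +INF
    const float neg_x = -x;
    if (neg_x > kMaxExp) {
        // exp(-x) -> +INF, so the denominator is +INF and the quotient collapses to 0;
        // this is the lane value Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out) selects.
        return 0.0f;
    }
    return x / (1.0f + std::exp(neg_x));
}
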