chraac · chraac · Aug 29, 2025 · Aug 2, 2025 · Aug 2, 2025 · Aug 3, 2025
diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt
@@ -220,7 +220,7 @@ else()
         target_compile_options(hexagon_npu_skel_OBJS PUBLIC
             -fsanitize=address -fno-omit-frame-pointer
         )
-        target_link_libraries(hexagon_npu_skel_OBJS PUBLIC
+        target_link_options(hexagon_npu_skel_OBJS PUBLIC
             -fsanitize=address
         )
     endif()
@@ -248,9 +248,9 @@ else()
 
     add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
     target_link_libraries(hexagon_npu_skel
-        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
-        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
-        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a
+        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1
+        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1
+        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so
     )
     set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
     target_link_libraries(hexagon_npu_skel qprintf_static)

diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp
@@ -17,21 +17,30 @@
 namespace {
 
 struct npu_device_context {
+    std::unique_ptr<hexagon::power_utils>         power_utils;       // Power management utilities
     std::unique_ptr<hexagon::default_thread_pool> thread_pool;
     std::unique_ptr<float[]>                      f16_to_f32_table;  // TODO: store vtcm?
 
     bool init() {
         if (!init_ltu()) {
-            DEVICE_LOG_ERROR("Failed to initialize LTU");
+            DEVICE_LOG_ERROR("Failed to initialize LTU\n");
             return false;
         }
 
         if (!init_thread_pool()) {
-            DEVICE_LOG_ERROR("Failed to initialize thread pool");
+            DEVICE_LOG_ERROR("Failed to initialize thread pool\n");
             return false;
         }
 
-        DEVICE_LOG_DEBUG("NPU device context initialized");
+        power_utils = std::make_unique<hexagon::power_utils>();
+        if (power_utils && power_utils->is_valid()) {
+            power_utils->set_dvcs_performance_mode(true);
+            DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n");
+        } else {
+            DEVICE_LOG_ERROR("Failed to initialize power utilities\n");
+        }
+
+        DEVICE_LOG_DEBUG("NPU device context initialized\n");
         return true;
     }
 
@@ -41,29 +50,29 @@ struct npu_device_context {
 
         f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
         if (!f16_to_f32_table) {
-            DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table");
+            DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n");
             return false;
         }
 
         hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
-        DEVICE_LOG_DEBUG("f16_to_f32 table initialized");
+        DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n");
         return true;
     }
 
     bool init_thread_pool() {
         if (thread_pool) {
-            DEVICE_LOG_DEBUG("Thread pool already initialized");
+            DEVICE_LOG_DEBUG("Thread pool already initialized\n");
             return true;
         }
 
         auto pool = std::make_unique<hexagon::default_thread_pool>();
         if (!pool) {
-            DEVICE_LOG_ERROR("Failed to create thread pool");
+            DEVICE_LOG_ERROR("Failed to create thread pool\n");
             return false;
         }
 
         thread_pool = std::move(pool);
-        DEVICE_LOG_DEBUG("Thread pool initialized");
+        DEVICE_LOG_DEBUG("Thread pool initialized\n");
         return true;
     }
 };
@@ -102,25 +111,25 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
     // TODO: should we have a device context here?
     auto * context = new npu_device_context();
     if (!context->init()) {
-        DEVICE_LOG_ERROR("Failed to initialize npu_device_context");
+        DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n");
         delete context;
         return AEE_EFAILED;
     }
 
     *h = reinterpret_cast<remote_handle64>(context);
-    DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
+    DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h);
     return AEE_SUCCESS;
 }
 
 int npu_device_close(remote_handle64 h) {
     auto * context = device_context_from_handle(h);
     if (!context) {
-        DEVICE_LOG_ERROR("Invalid npu_device_context handle");
+        DEVICE_LOG_ERROR("Invalid npu_device_context handle\n");
         return AEE_EINVHANDLE;
     }
 
     delete context;
-    DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
+    DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h);
     return AEE_SUCCESS;
 }
 
@@ -139,7 +148,7 @@ AEEResult npu_device_device_support_op(remote_handle64                   _h,
     NPU_UNUSED(_h);
 
     if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
-        DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
+        DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n");
         return AEE_EINVARGS;
     }
 
@@ -185,7 +194,7 @@ AEEResult npu_device_tensors_free(remote_handle64                    _h,
                                   int                                tensor_handlesLen) {
     NPU_UNUSED(_h);
     if (!tensor_handles || tensor_handlesLen < 0) {
-        DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments");
+        DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n");
         return AEE_EINVARGS;
     }
 
@@ -194,7 +203,7 @@ AEEResult npu_device_tensors_free(remote_handle64                    _h,
         if (tensor) {
             delete tensor;
         } else {
-            DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i);
+            DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i);
         }
     }
 
@@ -250,13 +259,13 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64
 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
     auto dev_ctx = device_context_from_handle(_h);
     if (!dev_ctx) {
-        DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
+        DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n");
         return AEE_EINVHANDLE;
     }
 
     auto * graph = graph_from_handle(graph_handle);
     if (!graph) {
-        DEVICE_LOG_ERROR("Invalid graph handle");
+        DEVICE_LOG_ERROR("Invalid graph handle\n");
         return AEE_EINVHANDLE;
     }
 

diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp
@@ -91,6 +91,7 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread
 
         const bool should_sync = requires_thread_barrier(op);
         if (pool && should_sync && i < _tensor_count - 1) {
+            // Sync here only when this is not the last tensor; for the last one the thread pool will handle synchronization
             DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]",
                                               (void *) this,
                                               params.get_thread_index(),

diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp
@@ -13,7 +13,7 @@ inline float f16_to_f32(const npu_device_fp16_t src) {
 }
 
 // From: ggml/src/ggml-cpu/ops.cpp
-template <bool _IsKvF16>
+template <bool _IsKvF16, bool _HasMask>
 void flash_attn_impl(hexagon::tensor *         out,
                      const hexagon::tensor *   q,
                      const hexagon::tensor *   k,
@@ -24,6 +24,7 @@ void flash_attn_impl(hexagon::tensor *         out,
     static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count");
 
     constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32;
+    constexpr const bool                        kHasMask    = _HasMask;
 
     if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) {
         DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n",
@@ -32,6 +33,11 @@ void flash_attn_impl(hexagon::tensor *         out,
         return;
     }
 
+    if (kHasMask != (mask != nullptr)) {
+        DEVICE_LOG_ERROR("flash_attn_impl: mask presence does not match the _HasMask template argument\n");
+        return;
+    }
+
     float       scale         = out->get_op_param<float>(0);
     const float max_bias      = out->get_op_param<float>(1);
     const float logit_softcap = out->get_op_param<float>(2);
@@ -96,7 +102,7 @@ void flash_attn_impl(hexagon::tensor *         out,
     const uint8_t * q_ptr     = q->get_read_buffer();
     const uint8_t * k_ptr     = k->get_read_buffer();
     const uint8_t * v_ptr     = v->get_read_buffer();
-    const uint8_t * mask_ptr  = mask ? mask->get_read_buffer() : nullptr;
+    const uint8_t * mask_ptr  = kHasMask ? mask->get_read_buffer() : nullptr;
     const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr;
     float *         VKQ32     = reinterpret_cast<float *>(cache_ptr);          // FP32 VKQ accumulator
     auto * VKQ16 = reinterpret_cast<npu_device_fp16_t *>(VKQ32 + aligned_dv);  // (temporary) FP16 VKQ accumulator
@@ -125,11 +131,17 @@ void flash_attn_impl(hexagon::tensor *         out,
         }
 
         const npu_device_fp16_t * mp =
-            mask_ptr ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
+            kHasMask ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
                                                                    (iq2 % mask->get_ne(2)) * mask->get_nb(2) +
                                                                    (iq3 % mask->get_ne(3)) * mask->get_nb(3)) :
                        nullptr;
 
+        q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
+
+        if (kHasMask) {
+            hexagon::l2fetch_row(reinterpret_cast<const uint8_t *>(mp), mask->get_nb(1));
+        }
+
         // k indices
         const int ik3 = iq3 / rk3;
         const int ik2 = iq2 / rk2;
@@ -138,16 +150,14 @@ void flash_attn_impl(hexagon::tensor *         out,
         const int iv3 = iq3 / rv3;
         const int iv2 = iq2 / rv2;
 
-        q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
-
         // online softmax / attention
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         const auto * k_plane_ptr = k_ptr + ik2 * k->get_nb(2) + ik3 * k->get_nb(3);
         const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3);
         for (int64_t ic = 0; ic < k->get_ne(1); ++ic) {
             DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop);
-            float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f;
+            float mv = kHasMask ? (slope * f16_to_f32(mp[ic])) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
@@ -282,9 +292,17 @@ bool flash_attn_f32(tensor * out, compute_params * params) {
     const auto * mask  = out->get_src(3);
     const auto * sinks = out->get_src(4);
     if (k->get_type() == NPU_DATA_TYPE_F16) {
-        flash_attn_impl<true>(out, q, k, v, mask, sinks, params);
+        if (mask) {
+            flash_attn_impl<true, true>(out, q, k, v, mask, sinks, params);
+        } else {
+            flash_attn_impl<true, false>(out, q, k, v, mask, sinks, params);
+        }
     } else {
-        flash_attn_impl<false>(out, q, k, v, mask, sinks, params);
+        if (mask) {
+            flash_attn_impl<false, true>(out, q, k, v, mask, sinks, params);
+        } else {
+            flash_attn_impl<false, false>(out, q, k, v, mask, sinks, params);
+        }
     }
     return true;
 }
@@ -338,8 +356,8 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
 
     if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) {
         DEVICE_LOG_DEBUG(
-            "[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, "
-            "v ne: %ld, %ld, %ld, %ld\n",
+            "[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, "
+            "v ne: %lld, %lld, %lld, %lld\n",
             op_get_name(op),
             dst->ne[0],
             dst->ne[1],
@@ -359,24 +377,25 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
     if (is_transposed_or_permuted(dst->nb)) {
         DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n",
                          op_get_name(op),
-                         dst->nb[0],
-                         dst->nb[1],
-                         dst->nb[2],
-                         dst->nb[3]);
+                         (size_t) dst->nb[0],
+                         (size_t) dst->nb[1],
+                         (size_t) dst->nb[2],
+                         (size_t) dst->nb[3]);
         return false;
     }
 
     if (q->ne[0] != k->ne[0]) {
-        DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n",
-                         op_get_name(op),
-                         q->ne[0],
-                         q->ne[1],
-                         q->ne[2],
-                         q->ne[3],
-                         k->ne[0],
-                         k->ne[1],
-                         k->ne[2],
-                         k->ne[3]);
+        DEVICE_LOG_DEBUG(
+            "[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n",
+            op_get_name(op),
+            q->ne[0],
+            q->ne[1],
+            q->ne[2],
+            q->ne[3],
+            k->ne[0],
+            k->ne[1],
+            k->ne[2],
+            k->ne[3]);
         return false;
     }