From 04c99b10b171562787e63a5f6e37c0e8c9770363 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 09:22:49 +0800 Subject: [PATCH 01/53] rename --- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 68 +++++++++++++----------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 6f89f454598ba..410ea4af560f6 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -16,54 +16,58 @@ template inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * const iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + + while (src0_vec_ptr < src0_vec_ptr_end) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = _OpIntrinsic(s0, s1); - prev0 = curr0; - prev1 = curr1; + dst_vec_ptr[0] = _OpIntrinsic(s0, s1); + dst_vec_ptr++; + prev0 = curr0; + prev1 = curr1; } const size_t leftover = count % kElementsPerVector; - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(iptr0); - bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(iptr1); - HVX_Vector curr0 = should_fetch_src0 ? *iptr0 : prev0; - HVX_Vector curr1 = should_fetch_src1 ? *iptr1 : prev1; - iptr0 += should_fetch_src0 ? 1 : 0; - iptr1 += should_fetch_src1 ? 1 : 0; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = _OpIntrinsic(s0, s1); - prev0 = curr0; - prev1 = curr1; + bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); + bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; + HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; + src0_vec_ptr += should_fetch_src0 ? 1 : 0; + src1_vec_ptr += should_fetch_src1 ? 
1 : 0; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + dst_vec_ptr[0] = _OpIntrinsic(s0, s1); + dst_vec_ptr++; + prev0 = curr0; + prev1 = curr1; } const size_t leftover_bytes = leftover * sizeof(_TyData); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = - (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector curr1 = - (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - hexagon::q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1)); + hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpIntrinsic(curr0, curr1)); } } From 28d527e8d92242a22feb78524bae1fcd2b206801 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 09:41:17 +0800 Subject: [PATCH 02/53] Refactor vector operations in vec_op_impl and vec_dot_product_impl for improved clarity and performance --- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 38 +++++++++++++++++++----- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 36 ++++++++++++---------- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 410ea4af560f6..9eaa27e388ec8 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -23,15 +23,36 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count HVX_Vector prev0 = *src0_vec_ptr++; HVX_Vector prev1 = *src1_vec_ptr++; - while (src0_vec_ptr < src0_vec_ptr_end) { + { + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr += 2; + src1_vec_ptr += 2; + + dst_vec_ptr[0] = _OpIntrinsic(l0, l1); + dst_vec_ptr[1] = _OpIntrinsic(h0, h1); + dst_vec_ptr += 2; + } + } + + if (src0_vec_ptr_end - src0_vec_ptr > 0) { HVX_Vector curr0 = *src0_vec_ptr++; HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - dst_vec_ptr[0] = _OpIntrinsic(s0, s1); + prev0 = curr0; + prev1 = curr1; + + dst_vec_ptr[0] = _OpIntrinsic(s0, s1); dst_vec_ptr++; - prev0 = curr0; - prev1 = curr1; } const size_t leftover = count % kElementsPerVector; @@ -46,12 +67,13 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; src0_vec_ptr += should_fetch_src0 ? 
1 : 0; src1_vec_ptr += should_fetch_src1 ? 1 : 0; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + dst_vec_ptr[0] = _OpIntrinsic(s0, s1); dst_vec_ptr++; - prev0 = curr0; - prev1 = curr1; } const size_t leftover_bytes = leftover * sizeof(_TyData); diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 4375bb7d5b7ae..21da3493369f8 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -15,27 +15,31 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size HVX_Vector prev0 = *src0_vec_ptr++; HVX_Vector prev1 = *src1_vec_ptr++; HVX_Vector sum = Q6_V_vzero(); - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); - while (src0_vec_ptr_end - src0_vec_ptr > 1) { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + { + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr += 2; + src1_vec_ptr += 2; - HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - prev0 = Q6_V_hi_W(curr0); - prev1 = Q6_V_hi_W(curr1); - src0_vec_ptr += 2; - src1_vec_ptr += 2; + sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); + sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + } - sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); - sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + sum = _AddFunc(sum0, sum1); } - sum = _AddFunc(sum0, sum1); if (src0_vec_ptr_end - src0_vec_ptr > 0) { HVX_Vector curr0 = *src0_vec_ptr++; HVX_Vector curr1 = *src1_vec_ptr++; From ddf95af70279cf04275fb505d1a353f626bc04df Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 13:02:46 +0800 Subject: [PATCH 03/53] wip --- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 22 ++++++++++---------- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 14 ++++++------- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 6 +++++- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 9eaa27e388ec8..f39f6c230bf38 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -12,8 +12,8 @@ namespace { -template -inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { +template +inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / 
sizeof(_TyData); HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); @@ -37,8 +37,8 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count src0_vec_ptr += 2; src1_vec_ptr += 2; - dst_vec_ptr[0] = _OpIntrinsic(l0, l1); - dst_vec_ptr[1] = _OpIntrinsic(h0, h1); + dst_vec_ptr[0] = _OpBinaryTransform(l0, l1); + dst_vec_ptr[1] = _OpBinaryTransform(h0, h1); dst_vec_ptr += 2; } } @@ -51,7 +51,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count prev0 = curr0; prev1 = curr1; - dst_vec_ptr[0] = _OpIntrinsic(s0, s1); + dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); dst_vec_ptr++; } @@ -72,7 +72,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count prev0 = curr0; prev1 = curr1; - dst_vec_ptr[0] = _OpIntrinsic(s0, s1); + dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); dst_vec_ptr++; } @@ -89,13 +89,13 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpIntrinsic(curr0, curr1)); + hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1)); } } -template +template inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { - vec_op_impl<_OpIntrinsic, float>(src0, src1, count, dst); + vec_trans_op_impl<_OpBinaryTransform, float>(src0, src1, count, dst); } inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) { @@ -110,10 +110,10 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)); } -template +template inline void vec_op_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count, npu_device_fp16_t * dst) { - vec_op_impl<_OpIntrinsic, npu_device_fp16_t>(src0, src1, count, dst); + vec_trans_op_impl<_OpBinaryTransform, npu_device_fp16_t>(src0, src1, count, dst); } inline HVX_Vector vadd_f16_f16(HVX_Vector a, HVX_Vector b) { diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index ff1335ace2731..9490d940995d6 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -14,18 +14,18 @@ struct get_data_type { using data_type1 = _TyData1; }; -template +template void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, hexagon::compute_params * params) { using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - static_assert(!_IsQuantized || std::is_same_v, + static_assert(!_ShouldCacheSrc0 || std::is_same_v, "data_type0 must be the same as hexagon::dequant_target_type"); const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; - if (_IsQuantized && dequantize_row_func == nullptr) { + if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; } @@ -61,7 +61,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso size_t src0_plane_cache_size = 0; uint8_t * src0_plane_cache_ptr = nullptr; const uint8_t * last_cached_plane_ptr = nullptr; - if constexpr (_IsQuantized) { + if constexpr (_ShouldCacheSrc0) { src0_plane_slice_row_count = std::min(params->get_vtcm_quota_size() / 
src0_actual_row_size, src0_plane_slice_row_count); src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; @@ -78,7 +78,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso DEVICE_LOG_DEBUG( "mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: " "%p(%zu)\n", - src0_actual_row_size, src0_plane_slice_row_count, _IsQuantized, (void *) src0_plane_cache_ptr, + src0_actual_row_size, src0_plane_slice_row_count, _ShouldCacheSrc0, (void *) src0_plane_cache_ptr, src0_plane_cache_size); const size_t valid_row0_bytes = src0->get_ne(0) * sizeof(data_type0); @@ -92,7 +92,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso return; } - constexpr bool should_fetch_src0_row = !_IsQuantized; + constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0; const uint8_t * src0_ptr = src0->get_read_buffer(); const uint8_t * src1_ptr = src1->get_read_buffer(); for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { @@ -107,7 +107,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso start_end_element.second - col_idx); // number of rows in this slice const uint8_t * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1); - if constexpr (_IsQuantized) { + if constexpr (_ShouldCacheSrc0) { if (last_cached_plane_ptr != src0_plane) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant); diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 704607167fec5..1f735fc3504f5 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -488,6 +488,10 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_target_type * dst, s } } +void copy_row_f16(const void * src, hexagon::dequant_target_type * dst, size_t count) { + memcpy(dst, src, count * sizeof(hexagon::dequant_target_type)); +} + template struct dot_func_traits {}; template struct dot_func_traits { @@ -505,7 +509,7 @@ template float wrap_dot_func(const void * src0, const void * src constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, nullptr, nullptr, wrap_dot_func }, - { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, nullptr, quantize_row_fp16, + { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, copy_row_f16, quantize_row_fp16, wrap_dot_func }, { NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false, nullptr, nullptr, nullptr }, { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0, From f0d51d24449e7fa3ccdcb9a73bf081d5928b2f6e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 13:25:59 +0800 Subject: [PATCH 04/53] Enhance vector copy functions for improved performance and clarity in vec_ops.hpp --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 4 +-- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 2 +- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 32 +++++++++++++++----- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 9490d940995d6..7a8370a9a0b9e 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -300,8 +300,8 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * 
params) { switch (src1->get_type()) { case NPU_DATA_TYPE_F32: if (is_src0_quantized || src0->get_type() == NPU_DATA_TYPE_F16) { - kMulMatF16F32Funcs[is_src0_quantized][is_mul_mat_f16_f32_src_tensors_aligned( - src0, src1, is_src0_quantized)](src0, src1, out, params); + kMulMatF16F32Funcs[1][is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, is_src0_quantized)]( + src0, src1, out, params); } else { if (is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)) { mul_mat_impl(src0, src1, out, params); diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 1f735fc3504f5..85d8506e6756f 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -489,7 +489,7 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_target_type * dst, s } void copy_row_f16(const void * src, hexagon::dequant_target_type * dst, size_t count) { - memcpy(dst, src, count * sizeof(hexagon::dequant_target_type)); + hexagon::vec_cpy_f16(reinterpret_cast(src), dst, count); } template struct dot_func_traits {}; diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 220dc8f77c02d..a88235bb89ee8 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -288,14 +288,6 @@ inline HVX_Vector hvx_vec_mad_f32_f32(HVX_Vector src, HVX_UVector * dst_ptr, HVX return Q6_Vsf_equals_Vqf32(src); } -inline void vec_scale_f32(const float * src, float scale, float * dst, size_t count) { - vec_scale_impl(src, scale, dst, count); -} - -inline void vec_mad_f32(const float * src, float scale, float * dst, size_t count) { - vec_scale_impl(src, scale, dst, count); -} - inline HVX_Vector hvx_scale_f16(float scale) { __fp16 f16_scale = scale; return Q6_Vh_vsplat_R(reinterpret_cast(f16_scale)); @@ -312,6 +304,26 @@ inline HVX_Vector hvx_vec_mad_f16_f16(HVX_Vector src, HVX_UVector * dst_ptr, HVX return Q6_Vhf_equals_Vqf16(result); } +inline HVX_Vector hvx_nop(float scale) { + return HVX_Vector(); +} + +inline HVX_Vector hvx_passthru(HVX_Vector src, HVX_UVector *, HVX_Vector) { + return src; +} + +inline void vec_scale_f32(const float * src, float scale, float * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +inline void vec_mad_f32(const float * src, float scale, float * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +inline void vec_cpy_f32(const float * src, float * dst, size_t count) { + vec_scale_impl(src, 0, dst, count); +} + inline void vec_scale_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { vec_scale_impl(src, scale, dst, count); } @@ -320,6 +332,10 @@ inline void vec_mad_f16(const npu_device_fp16_t * src, float scale, npu_device_f vec_scale_impl(src, scale, dst, count); } +inline void vec_cpy_f16(const npu_device_fp16_t * src, npu_device_fp16_t * dst, size_t count) { + vec_scale_impl(src, 0, dst, count); +} + template inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, size_t count) { static_assert(sizeof(_TElem0) <= sizeof(_TElem1), "src0 should be smaller than src1"); From 814a8d40ea9af0920d292623bea4f8e1b5f77b89 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 14:08:10 +0800 Subject: [PATCH 05/53] wip --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp 
b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 7a8370a9a0b9e..b80446cb6fa75 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -250,17 +250,10 @@ bool is_mul_mat_f32_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::ten typedef void (*mul_mat_func_type)(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, hexagon::compute_params * params); -constexpr const mul_mat_func_type kMulMatF16F32Funcs[2][2] = { - { - // non-quantized - mul_mat_impl, // F32 * F32 unaligned - mul_mat_impl, // F32 * F32 aligned - }, - { - // quantized - mul_mat_impl, // F32 * F32 quantized unaligned - mul_mat_impl, // F32 * F32 quantized aligned - }, +constexpr const mul_mat_func_type kMulMatF16F32Funcs[2] = { + // quantized and non-quantized + mul_mat_impl, // F32 * F32 quantized unaligned + mul_mat_impl, // F32 * F32 quantized aligned }; constexpr const mul_mat_func_type kMulMatF16Funcs[2][2] = { @@ -300,8 +293,8 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { switch (src1->get_type()) { case NPU_DATA_TYPE_F32: if (is_src0_quantized || src0->get_type() == NPU_DATA_TYPE_F16) { - kMulMatF16F32Funcs[1][is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, is_src0_quantized)]( - src0, src1, out, params); + kMulMatF16F32Funcs[is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, is_src0_quantized)](src0, src1, + out, params); } else { if (is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)) { mul_mat_impl(src0, src1, out, params); From 41f3f6400c1c2a238b4729b8aa7e6592d3ab7d94 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 16:37:15 +0800 Subject: [PATCH 06/53] wip --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 5 ++++- ggml/src/ggml-qnn/npu/device/op_rope.cpp | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 9c264654c1c9e..bc423a7cc4e44 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -218,7 +218,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const int i3 = iq3; // permute(0, 2, 1, 3) - memcpy(dst_ptr + (i3 * out_rows_per_batch + i2 + i1 * out->get_ne(1)) * out->get_nb(1), VKQ32, out->get_nb(1)); + hexagon::vec_cpy_f32( + reinterpret_cast(VKQ32), + reinterpret_cast(dst_ptr + (i3 * out_rows_per_batch + i2 + i1 * out->get_ne(1)) * out->get_nb(1)), + out->get_ne(0)); } out->release_write_buffer(); // mark the output tensor as modified diff --git a/ggml/src/ggml-qnn/npu/device/op_rope.cpp b/ggml/src/ggml-qnn/npu/device/op_rope.cpp index 514c445290ef2..34bd0409db90e 100644 --- a/ggml/src/ggml-qnn/npu/device/op_rope.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_rope.cpp @@ -270,8 +270,9 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { } } else { // fill the remain channels with data from src tensor - memcpy(dst_row + n_dims * out->get_nb(0), src0_row + n_dims * src0->get_nb(0), - (out->get_ne(0) - n_dims) * sizeof(float)); + hexagon::vec_cpy_f32(reinterpret_cast(src0_row + n_dims * src0->get_nb(0)), + reinterpret_cast(dst_row + n_dims * out->get_nb(0)), + out->get_ne(0) - n_dims); } } } From ceb2fe224283e8901570b7dba3dbbf39342ff4e6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 4 Jul 2025 23:48:19 +0800 Subject: [PATCH 07/53] wip --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 6 ++- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 46 
++++++------------- 2 files changed, 18 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index bc423a7cc4e44..2f5d7077ef7f8 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -127,6 +127,8 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex // online softmax / attention // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf + const auto * k_plane_ptr = k_ptr + ik2 * k->get_nb(2) + ik3 * k->get_nb(3); + const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3); for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; @@ -137,7 +139,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex float s = 0.f; { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); - const auto * k_data = k_ptr + (ic * k->get_nb(1) + ik2 * k->get_nb(2) + ik3 * k->get_nb(3)); + const auto * k_data = k_plane_ptr + ic * k->get_nb(1); if (ic < k->get_ne(1) - 1) { hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); } @@ -156,7 +158,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value float vs = 1.0f; // post-softmax KQ value, expf(s - M) - const auto * v_data = v_ptr + (ic * v->get_nb(1) + iv2 * v->get_nb(2) + iv3 * v->get_nb(3)); + const auto * v_data = v_plane_ptr + ic * v->get_nb(1); if (ic < v->get_ne(1)) { hexagon::l2fetch_row(v_data, row_bytes_v); } diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index a88235bb89ee8..5307bb756481d 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -166,22 +166,13 @@ inline HVX_VectorPair hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); - - // TODO: do we have a better way to do the reduction? 
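// NOTE: illustrative sketch, not part of this patch.
// Both the switch-based form removed below and the new straight-line form are
// the same log2 tree reduction: rotate the vector by half the remaining lane
// count and add, so after log2(32) = 5 vror/vadd steps every lane holds the
// sum of all 32 qf32 lanes. A scalar model of the same idea, assuming plain
// floats instead of HVX_Vector/qf32:
//
//   float lanes[32];                      // stand-in for one 128-byte vector
//   for (int step = 16; step >= 1; step /= 2) {
//       float rotated[32];
//       for (int i = 0; i < 32; ++i) {
//           rotated[i] = lanes[(i + step) % 32];  // models Q6_V_vror_VR(sums, step * sizeof(float))
//       }
//       for (int i = 0; i < 32; ++i) {
//           lanes[i] += rotated[i];               // models Q6_Vqf32_vadd_Vqf32Vqf32
//       }
//   }
//   // lanes[0] (and every other lane) now holds the total.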
- switch (kFloatsPerVector) { - default: - case 32: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); - // fallthrough - case 16: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); - break; - } + static_assert(kFloatsPerVector == 32, "kFloatsPerVector should be 32"); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); return sums; } @@ -191,23 +182,14 @@ inline float vec_reduction_f32_qf32(HVX_Vector sums) { inline HVX_Vector vec_reduction_qf16(HVX_Vector sums) { constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); - static_assert(kFloatsPerVector == 64 || kFloatsPerVector == 32, "kFloatsPerVector should be 32 or 64"); - - // TODO: do we have a better way to do the reduction? - switch (kFloatsPerVector) { - default: - case 64: - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 32 * sizeof(npu_device_fp16_t))); - // fallthrough - case 32: - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 16 * sizeof(npu_device_fp16_t))); - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 8 * sizeof(npu_device_fp16_t))); - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 4 * sizeof(npu_device_fp16_t))); - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 2 * sizeof(npu_device_fp16_t))); - sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, sizeof(npu_device_fp16_t))); - break; - } - + static_assert(kFloatsPerVector == 64, "kFloatsPerVector should be 64"); + + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 32 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 16 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 8 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 4 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 2 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, sizeof(npu_device_fp16_t))); return sums; } From 889cb692b56886416b5a5405ce09e9c3c8369a36 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 00:30:23 +0800 Subject: [PATCH 08/53] Optimize vector dot product implementations for enhanced performance and efficiency --- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 71 ++++++++++++++++-------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 21da3493369f8..ebfb0f41ecfd4 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -101,14 +101,35 @@ inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - while (src0_vec_ptr_end - src0_vec_ptr > 1) { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr1 = 
reinterpret_cast(src1_vec_ptr)[0]; - src0_vec_ptr += 2; - src1_vec_ptr += 2; + { + HVX_Vector sum2 = Q6_V_vzero(); + HVX_Vector sum3 = Q6_V_vzero(); + while (src0_vec_ptr_end - src0_vec_ptr > 3) { + HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + src0_vec_ptr += 4; + src1_vec_ptr += 4; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); + sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + } + + if (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + src0_vec_ptr += 2; + src1_vec_ptr += 2; + + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); + } + + sum0 = _AddFunc(sum2, sum0); + sum1 = _AddFunc(sum3, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 0) { @@ -160,27 +181,31 @@ inline float vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector prev0 = *src0_vec_ptr++; HVX_Vector prev1 = *src1_vec_ptr++; HVX_Vector sum = Q6_V_vzero(); - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); - while (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + { + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); - prev0 = curr0; - prev1 = Q6_V_hi_W(curr1); - src0_vec_ptr++; - src1_vec_ptr += 2; + while (src1_vec_ptr_end - src1_vec_ptr > 1) { + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); + prev0 = curr0; + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr++; + src1_vec_ptr += 2; + + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), h1), sum1); + } - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), h1), sum1); + sum = _AddFunc(sum0, sum1); } - sum = _AddFunc(sum0, sum1); const size_t leftover1 = count % kElementsPerVector1; if ((src1_vec_ptr_end - ((HVX_Vector *) src1)) > 0) { // handle the last vector From cec3fd8273de03d4958b91b973f17d70e6fbafeb Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 01:26:31 +0800 Subject: [PATCH 09/53] Enhance flash attention implementation and type traits for improved vector operations and alignment checks # Conflicts: # ggml/src/ggml-qnn/npu/device/type_traits.cpp --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 
20 +++++++++++-------- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 16 +++++++++------ ggml/src/ggml-qnn/npu/device/type_traits.hpp | 9 ++++++--- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 2f5d7077ef7f8..943f45d42289a 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -37,8 +37,14 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const float m0 = powf(2.0f, -(max_bias) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const auto q_to_vec_dot = hexagon::get_type_traits(k->get_type()).from_float; // TODO: fix this - const auto kq_vec_dot = hexagon::get_type_traits(k->get_type()).vec_dot; + const auto DK = k->get_ne(0); + const uint8_t * k_ptr = k->get_read_buffer(); + const auto & k_type_traits = hexagon::get_type_traits(k->get_type()); + const auto is_vec_aligned = k_type_traits.can_use_aligned_vec_dot; + const auto is_k_vec_aligned = is_vec_aligned(k_ptr, k_ptr, DK); + + const auto q_to_vec_dot = k_type_traits.from_float; // TODO: fix this + const auto kq_vec_dot = is_k_vec_aligned ? k_type_traits.vec_dot_aligned : k_type_traits.vec_dot; if (!q_to_vec_dot || !kq_vec_dot) { DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n"); return; @@ -47,15 +53,14 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const int64_t total_rows = q->get_ne(1) * q->get_ne(2) * q->get_ne(3); // total number of rows in Q const auto start_end_row = params->get_work_slice(total_rows); // work slice for this thread - const auto DK = k->get_ne(0); const auto DV = v->get_ne(0); const auto row_bytes_q = q->get_ne(0) * hexagon::get_type_traits(q->get_type()).type_size; - const auto row_bytes_k = DK * hexagon::get_type_traits(k->get_type()).type_size; + const auto row_bytes_k = DK * k_type_traits.type_size; const auto row_bytes_v = DV * hexagon::get_type_traits(v->get_type()).type_size; - constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - const auto aligned_dk = (DK + kFloatsPerVector - 1) / kFloatsPerVector * kFloatsPerVector; - const auto aligned_dv = (DV + kFloatsPerVector - 1) / kFloatsPerVector * kFloatsPerVector; + constexpr const size_t kFloatsPerVectorPair = hexagon::kBytesPerVector * 2 / sizeof(float); + const auto aligned_dk = (DK + kFloatsPerVectorPair - 1) / kFloatsPerVectorPair * kFloatsPerVectorPair; + const auto aligned_dv = (DV + kFloatsPerVectorPair - 1) / kFloatsPerVectorPair * kFloatsPerVectorPair; size_t total_cache_size = sizeof(float) * (aligned_dk + 2 * aligned_dv); auto * cache_ptr = params->get_vtcm_cache(total_cache_size); if (!cache_ptr) { @@ -77,7 +82,6 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), flash_attn); const uint8_t * q_ptr = q->get_read_buffer(); - const uint8_t * k_ptr = k->get_read_buffer(); const uint8_t * v_ptr = v->get_read_buffer(); const uint8_t * mask_ptr = mask ? 
mask->get_read_buffer() : nullptr; for (auto ir = start_end_row.first; ir < start_end_row.second; ++ir) { diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 85d8506e6756f..05a46b95fec5b 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -494,11 +494,13 @@ void copy_row_f16(const void * src, hexagon::dequant_target_type * dst, size_t c template struct dot_func_traits {}; -template struct dot_func_traits { - using param_type = std::remove_const_t>; +template struct dot_func_traits<_TReturn (*)(_TData, _TData, size_t)> { + using param_type = std::remove_const_t>; + using return_type = _TReturn; }; -template float wrap_dot_func(const void * src0, const void * src1, size_t count) { +template ::return_type> +_TReturn wrap_dot_func(const void * src0, const void * src1, size_t count) { using param_type = typename dot_func_traits::param_type; auto * src0_typed = reinterpret_cast(src0); @@ -508,10 +510,12 @@ template float wrap_dot_func(const void * src0, const void * src constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, nullptr, nullptr, - wrap_dot_func }, + wrap_dot_func, wrap_dot_func, + wrap_dot_func }, { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, copy_row_f16, quantize_row_fp16, - wrap_dot_func }, - { NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false, nullptr, nullptr, nullptr }, + wrap_dot_func, wrap_dot_func, + wrap_dot_func }, + { NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false }, { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0, quantize_row_q8_0 }, { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0, diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp index aa6e7d11ed500..224fffdc4d3c0 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -12,6 +12,7 @@ bool init_f16_f32_table(float * table, size_t count); typedef void (*quantize_row_type)(const float * src, void * dst, size_t count); typedef void (*dequantize_row_type)(const void * src, dequant_target_type * dst, size_t count); typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count); +typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count); struct device_type_traits { npu_device_tensor_data_type type; @@ -20,9 +21,11 @@ struct device_type_traits { size_t type_size; bool is_quantized; - dequantize_row_type to_float; - quantize_row_type from_float; - vec_dot_type vec_dot; + dequantize_row_type to_float; + quantize_row_type from_float; + vec_dot_type vec_dot; + vec_dot_type vec_dot_aligned; + can_use_aligned_vec_dot_type can_use_aligned_vec_dot; }; const device_type_traits & get_type_traits(npu_device_tensor_data_type type); From 311be5708d386dce55ec61761655ab78e7b8881c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 11:38:57 +0800 Subject: [PATCH 10/53] remove align --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 943f45d42289a..b5dce01f68d28 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ 
b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -37,14 +37,9 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const float m0 = powf(2.0f, -(max_bias) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const auto DK = k->get_ne(0); - const uint8_t * k_ptr = k->get_read_buffer(); - const auto & k_type_traits = hexagon::get_type_traits(k->get_type()); - const auto is_vec_aligned = k_type_traits.can_use_aligned_vec_dot; - const auto is_k_vec_aligned = is_vec_aligned(k_ptr, k_ptr, DK); - - const auto q_to_vec_dot = k_type_traits.from_float; // TODO: fix this - const auto kq_vec_dot = is_k_vec_aligned ? k_type_traits.vec_dot_aligned : k_type_traits.vec_dot; + const auto & k_type_traits = hexagon::get_type_traits(k->get_type()); + const auto q_to_vec_dot = k_type_traits.from_float; // TODO: fix this + const auto kq_vec_dot = k_type_traits.vec_dot; if (!q_to_vec_dot || !kq_vec_dot) { DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n"); return; @@ -53,6 +48,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const int64_t total_rows = q->get_ne(1) * q->get_ne(2) * q->get_ne(3); // total number of rows in Q const auto start_end_row = params->get_work_slice(total_rows); // work slice for this thread + const auto DK = k->get_ne(0); const auto DV = v->get_ne(0); const auto row_bytes_q = q->get_ne(0) * hexagon::get_type_traits(q->get_type()).type_size; const auto row_bytes_k = DK * k_type_traits.type_size; @@ -82,6 +78,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), flash_attn); const uint8_t * q_ptr = q->get_read_buffer(); + const uint8_t * k_ptr = k->get_read_buffer(); const uint8_t * v_ptr = v->get_read_buffer(); const uint8_t * mask_ptr = mask ? 
mask->get_read_buffer() : nullptr; for (auto ir = start_end_row.first; ir < start_end_row.second; ++ir) { From 3eb8efc8c6dc7c20b71a80adcc29efd62b59248b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 13:24:17 +0800 Subject: [PATCH 11/53] wip --- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index ebfb0f41ecfd4..28f2b733eb35e 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -98,12 +98,14 @@ inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum = Q6_V_vzero(); { + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); HVX_Vector sum2 = Q6_V_vzero(); HVX_Vector sum3 = Q6_V_vzero(); + while (src0_vec_ptr_end - src0_vec_ptr > 3) { HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; @@ -130,16 +132,17 @@ inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr sum0 = _AddFunc(sum2, sum0); sum1 = _AddFunc(sum3, sum1); + sum = _AddFunc(sum0, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 0) { HVX_Vector curr0 = src0_vec_ptr[0]; HVX_Vector curr1 = src1_vec_ptr[0]; - sum0 = _AddFunc(_MpyFunc(curr0, curr1), sum0); + sum = _AddFunc(_MpyFunc(curr0, curr1), sum); } - return _ReduceFunc(_AddFunc(sum0, sum1)); + return _ReduceFunc(sum); } inline HVX_Vector vec_mpy_qf32(HVX_Vector src0, HVX_Vector src1) { From 04f1c2cb0ea64f8c246b098247fd3f18dae41589 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 13:56:22 +0800 Subject: [PATCH 12/53] Enhance vector dot product implementation for improved performance by adding parallel processing for multiple vector pairs --- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 35 ++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 28f2b733eb35e..44fdcdf11d80d 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -19,8 +19,37 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size { HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum2 = Q6_V_vzero(); + HVX_Vector sum3 = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 3) { + HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + + HVX_Vector l00 = Q6_V_valign_VVR(Q6_V_lo_W(curr00), prev0, (size_t) src0); + HVX_Vector l10 = Q6_V_valign_VVR(Q6_V_lo_W(curr10), prev1, (size_t) src1); + HVX_Vector h00 = Q6_V_valign_VVR(Q6_V_hi_W(curr00), Q6_V_lo_W(curr00), (size_t) src0); + HVX_Vector h10 = Q6_V_valign_VVR(Q6_V_hi_W(curr10), Q6_V_lo_W(curr10), (size_t) src1); + + HVX_Vector l01 = Q6_V_valign_VVR(Q6_V_lo_W(curr01), Q6_V_hi_W(curr00), (size_t) src0); + HVX_Vector l11 = Q6_V_valign_VVR(Q6_V_lo_W(curr11), Q6_V_hi_W(curr10), (size_t) src1); + HVX_Vector h01 = 
Q6_V_valign_VVR(Q6_V_hi_W(curr01), Q6_V_lo_W(curr01), (size_t) src0); + HVX_Vector h11 = Q6_V_valign_VVR(Q6_V_hi_W(curr11), Q6_V_lo_W(curr11), (size_t) src1); - while (src0_vec_ptr_end - src0_vec_ptr > 1) { + prev0 = Q6_V_hi_W(curr01); + prev1 = Q6_V_hi_W(curr11); + src0_vec_ptr += 4; + src1_vec_ptr += 4; + + sum0 = _AddFunc(_MpyFunc(l00, l10), sum0); + sum1 = _AddFunc(_MpyFunc(h00, h10), sum1); + sum2 = _AddFunc(_MpyFunc(l01, l11), sum2); + sum3 = _AddFunc(_MpyFunc(h01, h11), sum3); + } + + if (src0_vec_ptr_end - src0_vec_ptr > 1) { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; @@ -37,7 +66,9 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); } - sum = _AddFunc(sum0, sum1); + sum0 = _AddFunc(sum2, sum0); + sum1 = _AddFunc(sum3, sum1); + sum = _AddFunc(sum0, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 0) { From 661c9164839761348bed9cc87c152756e8590ada Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 5 Jul 2025 13:56:37 +0800 Subject: [PATCH 13/53] Revert "Enhance vector dot product implementation for improved performance by adding parallel processing for multiple vector pairs" This reverts commit 78cc24ed2285002ca29d6189fa61ba4ce24f8d16. --- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 35 ++---------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 44fdcdf11d80d..28f2b733eb35e 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -19,37 +19,8 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size { HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); - - while (src0_vec_ptr_end - src0_vec_ptr > 3) { - HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - - HVX_Vector l00 = Q6_V_valign_VVR(Q6_V_lo_W(curr00), prev0, (size_t) src0); - HVX_Vector l10 = Q6_V_valign_VVR(Q6_V_lo_W(curr10), prev1, (size_t) src1); - HVX_Vector h00 = Q6_V_valign_VVR(Q6_V_hi_W(curr00), Q6_V_lo_W(curr00), (size_t) src0); - HVX_Vector h10 = Q6_V_valign_VVR(Q6_V_hi_W(curr10), Q6_V_lo_W(curr10), (size_t) src1); - - HVX_Vector l01 = Q6_V_valign_VVR(Q6_V_lo_W(curr01), Q6_V_hi_W(curr00), (size_t) src0); - HVX_Vector l11 = Q6_V_valign_VVR(Q6_V_lo_W(curr11), Q6_V_hi_W(curr10), (size_t) src1); - HVX_Vector h01 = Q6_V_valign_VVR(Q6_V_hi_W(curr01), Q6_V_lo_W(curr01), (size_t) src0); - HVX_Vector h11 = Q6_V_valign_VVR(Q6_V_hi_W(curr11), Q6_V_lo_W(curr11), (size_t) src1); - prev0 = Q6_V_hi_W(curr01); - prev1 = Q6_V_hi_W(curr11); - src0_vec_ptr += 4; - src1_vec_ptr += 4; - - sum0 = _AddFunc(_MpyFunc(l00, l10), sum0); - sum1 = _AddFunc(_MpyFunc(h00, h10), sum1); - sum2 = _AddFunc(_MpyFunc(l01, l11), sum2); - sum3 = _AddFunc(_MpyFunc(h01, h11), sum3); - } - - if (src0_vec_ptr_end - src0_vec_ptr > 1) { + while (src0_vec_ptr_end - src0_vec_ptr > 1) { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; @@ -66,9 +37,7 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); } - sum0 = _AddFunc(sum2, sum0); 
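// NOTE: illustrative sketch, not part of the patch being reverted.
// The unrolled loop above keeps four independent accumulators (sum0..sum3) so
// consecutive multiply-adds do not form one long dependency chain, while
// Q6_V_valign_VVR stitches the previous and current loads together so a
// misaligned source can still be consumed one full vector at a time; the
// partial sums are only merged after the loop. A scalar model of the
// multi-accumulator idea, with a, b and n standing in for the real operands:
//
//   float dot4(const float * a, const float * b, size_t n) {
//       float s0 = 0.f, s1 = 0.f, s2 = 0.f, s3 = 0.f;
//       size_t i = 0;
//       for (; i + 4 <= n; i += 4) {       // four independent chains per step
//           s0 += a[i + 0] * b[i + 0];
//           s1 += a[i + 1] * b[i + 1];
//           s2 += a[i + 2] * b[i + 2];
//           s3 += a[i + 3] * b[i + 3];
//       }
//       for (; i < n; ++i) { s0 += a[i] * b[i]; }  // leftover elements
//       return (s0 + s2) + (s1 + s3);      // merge partial sums at the end
//   }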
- sum1 = _AddFunc(sum3, sum1); - sum = _AddFunc(sum0, sum1); + sum = _AddFunc(sum0, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 0) { From afb8ea504ae896f92e8089c72df2dbd72d85bace Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 5 Jul 2025 17:14:33 +0800 Subject: [PATCH 14/53] Enhance flash attention implementation with type checks for tensor data types and improved constexpr usage --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index b5dce01f68d28..8e6f7ed900af2 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -13,10 +13,17 @@ inline float f16_to_f32(const npu_device_fp16_t src) { } // From: ggml/src/ggml-cpu/ops.cpp +template void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, const hexagon::tensor * v, const hexagon::tensor * mask, hexagon::compute_params * params) { static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count"); + if (k->get_type() != (_IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32) || v->get_type() != k->get_type()) { + DEVICE_LOG_ERROR("flash_attn_impl: k and v must be F16 type, got k: %s, v: %s\n", + hexagon::get_type_name(k->get_type()), hexagon::get_type_name(v->get_type())); + return; + } + float scale = out->get_op_param(0); const float max_bias = out->get_op_param(1); const float logit_softcap = out->get_op_param(2); @@ -65,11 +72,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex } // loop over n_batch and n_head - const auto rows_per_batch = q->get_ne(2) * q->get_ne(1); - const auto out_rows_per_batch = out->get_ne(2) * out->get_ne(1); - const bool is_v_f16 = - v->get_type() == NPU_DATA_TYPE_F16; // check if V is in FP16 format, otherwise it is in FP32 format - uint8_t * dst_ptr = out->get_write_buffer(); + constexpr bool is_v_f16 = _IsKvF16; // check if V is in FP16 format, otherwise it is in FP32 format + const auto rows_per_batch = q->get_ne(2) * q->get_ne(1); + const auto out_rows_per_batch = out->get_ne(2) * out->get_ne(1); + uint8_t * dst_ptr = out->get_write_buffer(); if (!dst_ptr) { DEVICE_LOG_ERROR("flash_attn_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out, hexagon::get_type_name(out->get_type())); @@ -99,7 +105,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex auto * Q_q = reinterpret_cast( VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 - if (is_v_f16) { + if constexpr (is_v_f16) { memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t)); } else { memset(VKQ32, 0, DV * sizeof(float)); @@ -164,7 +170,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex hexagon::l2fetch_row(v_data, row_bytes_v); } - if (is_v_f16) { + if constexpr (is_v_f16) { if (s > M) { // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f M = s; @@ -204,7 +210,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex S = S * ms + vs; // scale and increment sum with partial sum } - if (is_v_f16) { + if constexpr (is_v_f16) { // TODO: use a more efficient conversion for (int64_t d = 0; d < DV; ++d) { VKQ32[d] = f16_to_f32(VKQ16[d]); @@ -250,7 +256,11 @@ bool flash_attn_f32(tensor * out, compute_params * params) { return false; } - flash_attn_impl(out, q, k, v, 
mask, params); + if (k->get_type() == NPU_DATA_TYPE_F16) { + flash_attn_impl(out, q, k, v, mask, params); + } else { + flash_attn_impl(out, q, k, v, mask, params); + } return true; } From 0e626a86f5295b31c13c06f04bf2b43663bd68e8 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 5 Jul 2025 18:11:34 +0800 Subject: [PATCH 15/53] wip --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 8e6f7ed900af2..965698953f8ff 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -105,6 +105,9 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex auto * Q_q = reinterpret_cast( VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 + const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); + hexagon::l2fetch_row(q_data, row_bytes_q); + if constexpr (is_v_f16) { memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t)); } else { @@ -124,11 +127,6 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const int iv3 = iq3 / rv3; const int iv2 = iq2 / rv2; - const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); - if (iq1 < q->get_ne(1) - 1) { - hexagon::l2fetch_row(q_data + q->get_nb(1), row_bytes_q); - } - q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); // online softmax / attention From 854bc237d9dfb9d0c087495210185b7a4023cbd3 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 5 Jul 2025 22:39:00 +0800 Subject: [PATCH 16/53] opt mask calc --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 130 ++++++++++-------- 1 file changed, 69 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 965698953f8ff..371cbfefda691 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -12,6 +12,9 @@ inline float f16_to_f32(const npu_device_fp16_t src) { return reinterpret_cast(src); } +constexpr const float kF32Infinity = -INFINITY; // Use negative infinity to indicate invalid values in F32 tensors +constexpr const __fp16 kF16Infinity = (__fp16) kF32Infinity; + // From: ggml/src/ggml-cpu/ops.cpp template void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, @@ -96,9 +99,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const uint32_t h = iq2; // head index const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? 
powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + const bool is_slope_valid = slope != kF32Infinity; float S = 0.0f; // sum - float M = -INFINITY; // maximum KQ value + float M = kF32Infinity; // maximum KQ value float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator @@ -134,78 +138,82 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex // ref: https://arxiv.org/pdf/2112.05682.pdf const auto * k_plane_ptr = k_ptr + ik2 * k->get_nb(2) + ik3 * k->get_nb(3); const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3); - for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); - float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; - if (mv == -INFINITY) { - continue; - } - - float s = 0.f; - { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); - const auto * k_data = k_plane_ptr + ic * k->get_nb(1); - if (ic < k->get_ne(1) - 1) { - hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); + if (is_slope_valid) { + for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); + const auto mask_elem = mp[ic]; // mask element for current channel + if (reinterpret_cast(mask_elem) == kF16Infinity) { + continue; } - s = kq_vec_dot(k_data, Q_q, DK); // KQ value - s = s * scale; // scale KQ value - if (logit_softcap != 0.0f) { - s = logit_softcap * tanhf(s); // TODO: vectorize this? - } - - s += mv; // apply mask - } - - const float Mold = M; + float mv = mp ? (slope * f16_to_f32(mask_elem)) : 0.0f; - float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value - float vs = 1.0f; // post-softmax KQ value, expf(s - M) + float s = 0.f; + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); + const auto * k_data = k_plane_ptr + ic * k->get_nb(1); + if (ic < k->get_ne(1) - 1) { + hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); + } + + s = kq_vec_dot(k_data, Q_q, DK); // KQ value + s = s * scale; // scale KQ value + if (logit_softcap != 0.0f) { + s = logit_softcap * tanhf(s); // TODO: vectorize this? 
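// NOTE: illustrative sketch, not part of this patch.
// The code that follows implements the streaming ("online") softmax from the
// paper referenced above (arxiv 2112.05682): a running maximum M and running
// sum S are kept so the softmax needs only one pass over the KV rows. A
// scalar model of the V accumulation, where score() and v() stand in for the
// scaled/masked KQ dot product and the V row, and n_kv/DV correspond to
// k->get_ne(1) and v->get_ne(0):
//
//   float M = -INFINITY, S = 0.0f;
//   std::vector<float> acc(DV, 0.0f);        // softmax-weighted sum of V rows
//   for (int64_t ic = 0; ic < n_kv; ++ic) {
//       float s  = score(ic);
//       float ms = 1.0f, vs = 1.0f;
//       if (s > M) {                         // new maximum: rescale history
//           ms = expf(M - s);
//           M  = s;
//           for (int64_t d = 0; d < DV; ++d) acc[d] *= ms;
//       } else {
//           vs = expf(s - M);                // weight of the current row
//       }
//       for (int64_t d = 0; d < DV; ++d) acc[d] += vs * v(ic, d);
//       S = S * ms + vs;
//   }
//   // the final output row is acc[d] / S.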
+ } + + s += mv; // apply mask + } - const auto * v_data = v_plane_ptr + ic * v->get_nb(1); - if (ic < v->get_ne(1)) { - hexagon::l2fetch_row(v_data, row_bytes_v); - } + const float Mold = M; - if constexpr (is_v_f16) { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); + float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value + float vs = 1.0f; // post-softmax KQ value, expf(s - M) - // V = V*expf(Mold - M) - hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV); - } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); + const auto * v_data = v_plane_ptr + ic * v->get_nb(1); + if (ic < v->get_ne(1)) { + hexagon::l2fetch_row(v_data, row_bytes_v); } - // V += v*expf(s - M) - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); - hexagon::vec_mad_f16(reinterpret_cast(v_data), vs, VKQ16, DV); - } else { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); - - // V = V*expf(Mold - M) - hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV); + if constexpr (is_v_f16) { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + hexagon::vec_mad_f16(reinterpret_cast(v_data), vs, VKQ16, DV); } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + { + // V is F32 + hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); + } } - // V += v*expf(s - M) - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); - { - // V is F32 - hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); - } + S = S * ms + vs; // scale and increment sum with partial sum } - - S = S * ms + vs; // scale and increment sum with partial sum } if constexpr (is_v_f16) { From 709d7525810f8dc9319bc5bbb5150fd379e8e6fa Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 5 Jul 2025 22:53:46 +0800 Subject: [PATCH 17/53] Revert "opt mask calc" This reverts commit bb1840876692a11511d5ab7828b8a707402e30b9. 
--- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 130 ++++++++---------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 371cbfefda691..965698953f8ff 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -12,9 +12,6 @@ inline float f16_to_f32(const npu_device_fp16_t src) { return reinterpret_cast(src); } -constexpr const float kF32Infinity = -INFINITY; // Use negative infinity to indicate invalid values in F32 tensors -constexpr const __fp16 kF16Infinity = (__fp16) kF32Infinity; - // From: ggml/src/ggml-cpu/ops.cpp template void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, @@ -99,10 +96,9 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const uint32_t h = iq2; // head index const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - const bool is_slope_valid = slope != kF32Infinity; float S = 0.0f; // sum - float M = kF32Infinity; // maximum KQ value + float M = -INFINITY; // maximum KQ value float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator @@ -138,82 +134,78 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex // ref: https://arxiv.org/pdf/2112.05682.pdf const auto * k_plane_ptr = k_ptr + ik2 * k->get_nb(2) + ik3 * k->get_nb(3); const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3); - if (is_slope_valid) { - for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); - const auto mask_elem = mp[ic]; // mask element for current channel - if (reinterpret_cast(mask_elem) == kF16Infinity) { - continue; - } + for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); + float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; + if (mv == -INFINITY) { + continue; + } - float mv = mp ? (slope * f16_to_f32(mask_elem)) : 0.0f; + float s = 0.f; + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); + const auto * k_data = k_plane_ptr + ic * k->get_nb(1); + if (ic < k->get_ne(1) - 1) { + hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); + } - float s = 0.f; - { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); - const auto * k_data = k_plane_ptr + ic * k->get_nb(1); - if (ic < k->get_ne(1) - 1) { - hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); - } - - s = kq_vec_dot(k_data, Q_q, DK); // KQ value - s = s * scale; // scale KQ value - if (logit_softcap != 0.0f) { - s = logit_softcap * tanhf(s); // TODO: vectorize this? - } - - s += mv; // apply mask + s = kq_vec_dot(k_data, Q_q, DK); // KQ value + s = s * scale; // scale KQ value + if (logit_softcap != 0.0f) { + s = logit_softcap * tanhf(s); // TODO: vectorize this? 
} - const float Mold = M; + s += mv; // apply mask + } + + const float Mold = M; - float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value - float vs = 1.0f; // post-softmax KQ value, expf(s - M) + float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value + float vs = 1.0f; // post-softmax KQ value, expf(s - M) - const auto * v_data = v_plane_ptr + ic * v->get_nb(1); - if (ic < v->get_ne(1)) { - hexagon::l2fetch_row(v_data, row_bytes_v); + const auto * v_data = v_plane_ptr + ic * v->get_nb(1); + if (ic < v->get_ne(1)) { + hexagon::l2fetch_row(v_data, row_bytes_v); + } + + if constexpr (is_v_f16) { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); } - if constexpr (is_v_f16) { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); - - // V = V*expf(Mold - M) - hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV); - } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); - } - - // V += v*expf(s - M) - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); - hexagon::vec_mad_f16(reinterpret_cast(v_data), vs, VKQ16, DV); + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + hexagon::vec_mad_f16(reinterpret_cast(v_data), vs, VKQ16, DV); + } else { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV); } else { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); - - // V = V*expf(Mold - M) - hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV); - } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); - } - - // V += v*expf(s - M) - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); - { - // V is F32 - hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); - } + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); } - S = S * ms + vs; // scale and increment sum with partial sum + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + { + // V is F32 + hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); + } } + + S = S * ms + vs; // scale and increment sum with partial sum } if constexpr (is_v_f16) { From fb1614e2c7b91ef8d37ae628ec376173364b81ee Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 5 Jul 2025 23:16:16 +0800 Subject: [PATCH 18/53] wip --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 965698953f8ff..1db48924504b7 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -87,6 +87,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const uint8_t * k_ptr = k->get_read_buffer(); const uint8_t * v_ptr = v->get_read_buffer(); const uint8_t * mask_ptr = mask ? 
mask->get_read_buffer() : nullptr; + float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator + auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator + auto * Q_q = reinterpret_cast( + VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 for (auto ir = start_end_row.first; ir < start_end_row.second; ++ir) { // q indices const auto iq3 = ir / rows_per_batch; @@ -97,13 +101,8 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - float S = 0.0f; // sum - float M = -INFINITY; // maximum KQ value - - float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator - auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator - auto * Q_q = reinterpret_cast( - VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 + float S = 0.0f; // sum + float M = -INFINITY; // maximum KQ value const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); hexagon::l2fetch_row(q_data, row_bytes_q); From 05decd9e83cb1d2462b9e6d4aa8c079093ee29d3 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 6 Jul 2025 22:31:49 +0800 Subject: [PATCH 19/53] opt mul mat caching logic to add dst cache --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 28 +++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index b80446cb6fa75..5ec15ca394897 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -60,10 +60,12 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first; size_t src0_plane_cache_size = 0; uint8_t * src0_plane_cache_ptr = nullptr; + uint8_t * dst_row_cache_ptr = nullptr; const uint8_t * last_cached_plane_ptr = nullptr; if constexpr (_ShouldCacheSrc0) { - src0_plane_slice_row_count = - std::min(params->get_vtcm_quota_size() / src0_actual_row_size, src0_plane_slice_row_count); + const size_t dst_row_cache_size = hexagon::get_aligned_size(src0_plane_slice_row_count * sizeof(float)); + src0_plane_slice_row_count = std::min( + (params->get_vtcm_quota_size() - dst_row_cache_size) / src0_actual_row_size, src0_plane_slice_row_count); src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); if (src0_plane_cache_ptr == nullptr) { @@ -73,6 +75,9 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size); return; } + + dst_row_cache_ptr = src0_plane_cache_ptr; + src0_plane_cache_ptr += dst_row_cache_size; } DEVICE_LOG_DEBUG( @@ -136,8 +141,14 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, vec_dot); auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; - int64_t i0 = 0; + float * dst_row; + if constexpr (_ShouldCacheSrc0) { + dst_row = reinterpret_cast(dst_row_cache_ptr); + } else { + dst_row = 
reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; + } + + int64_t i0 = 0; for (; i0 + 1 < (int64_t) actual_row_count; i0 += 2) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; if constexpr (should_fetch_src0_row) { @@ -167,6 +178,11 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); } + + if constexpr (_ShouldCacheSrc0) { + hexagon::vec_cpy_f32(dst_row, reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx, + actual_row_count); + } } } } @@ -195,7 +211,9 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n } const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); - if (src0.ne[0] * sizeof(hexagon::dequant_target_type) > vtcm_thread_quota_size) { + const auto src0_cache_size = src0.ne[0] * sizeof(hexagon::dequant_target_type); + const auto dst_cache_size = hexagon::get_aligned_size(src0.ne[1] * sizeof(float)); + if (src0_cache_size + dst_cache_size > vtcm_thread_quota_size) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); return false; From 9e3f7597884ff7d235a8b2c02038990932c60325 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 7 Jul 2025 11:09:34 +0800 Subject: [PATCH 20/53] Revert "opt mul mat caching logic to add dst cache" This reverts commit ab442fa9f763b3873c929936e4cb739cb1c83850. --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 28 ++++----------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 5ec15ca394897..b80446cb6fa75 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -60,12 +60,10 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first; size_t src0_plane_cache_size = 0; uint8_t * src0_plane_cache_ptr = nullptr; - uint8_t * dst_row_cache_ptr = nullptr; const uint8_t * last_cached_plane_ptr = nullptr; if constexpr (_ShouldCacheSrc0) { - const size_t dst_row_cache_size = hexagon::get_aligned_size(src0_plane_slice_row_count * sizeof(float)); - src0_plane_slice_row_count = std::min( - (params->get_vtcm_quota_size() - dst_row_cache_size) / src0_actual_row_size, src0_plane_slice_row_count); + src0_plane_slice_row_count = + std::min(params->get_vtcm_quota_size() / src0_actual_row_size, src0_plane_slice_row_count); src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); if (src0_plane_cache_ptr == nullptr) { @@ -75,9 +73,6 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size); return; } - - dst_row_cache_ptr = src0_plane_cache_ptr; - src0_plane_cache_ptr += dst_row_cache_size; } DEVICE_LOG_DEBUG( @@ -141,14 +136,8 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, vec_dot); auto * src1_row = src1_plane + i1 * src1->get_nb(1); - float * dst_row; - if constexpr (_ShouldCacheSrc0) 
{ - dst_row = reinterpret_cast(dst_row_cache_ptr); - } else { - dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; - } - - int64_t i0 = 0; + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; + int64_t i0 = 0; for (; i0 + 1 < (int64_t) actual_row_count; i0 += 2) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; if constexpr (should_fetch_src0_row) { @@ -178,11 +167,6 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); } - - if constexpr (_ShouldCacheSrc0) { - hexagon::vec_cpy_f32(dst_row, reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx, - actual_row_count); - } } } } @@ -211,9 +195,7 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n } const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); - const auto src0_cache_size = src0.ne[0] * sizeof(hexagon::dequant_target_type); - const auto dst_cache_size = hexagon::get_aligned_size(src0.ne[1] * sizeof(float)); - if (src0_cache_size + dst_cache_size > vtcm_thread_quota_size) { + if (src0.ne[0] * sizeof(hexagon::dequant_target_type) > vtcm_thread_quota_size) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); return false; From 9643f21b552bf449a72ca90d3a531246faa1686e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 7 Jul 2025 12:33:51 +0800 Subject: [PATCH 21/53] wip --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 16 ++--- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 71 +++++++++++++++------ ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 10 +++ 3 files changed, 69 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index b80446cb6fa75..a61f7e4c548ae 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -252,20 +252,20 @@ typedef void (*mul_mat_func_type)(hexagon::tensor * src0, hexagon::tensor * src1 constexpr const mul_mat_func_type kMulMatF16F32Funcs[2] = { // quantized and non-quantized - mul_mat_impl, // F32 * F32 quantized unaligned - mul_mat_impl, // F32 * F32 quantized aligned + mul_mat_impl, // F32 * F32 quantized unaligned + mul_mat_impl, // F32 * F32 quantized aligned }; constexpr const mul_mat_func_type kMulMatF16Funcs[2][2] = { { // non-quantized - mul_mat_impl, // F16 * F16 unaligned - mul_mat_impl, // F16 * F16 aligned + mul_mat_impl, // F16 * F16 unaligned + mul_mat_impl, // F16 * F16 aligned }, { // quantized - mul_mat_impl, // F16 * F16 quantized unaligned - mul_mat_impl, // F16 * F16 quantized aligned + mul_mat_impl, // F16 * F16 quantized unaligned + mul_mat_impl, // F16 * F16 quantized aligned }, }; @@ -297,9 +297,9 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { out, params); } else { if (is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)) { - mul_mat_impl(src0, src1, out, params); + mul_mat_impl(src0, src1, out, params); } else { - mul_mat_impl(src0, src1, out, params); + mul_mat_impl(src0, src1, out, params); } } return true; diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 28f2b733eb35e..f16d8d60161de 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -4,9 +4,9 
@@ namespace { -template -inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) { +template +inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem); HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); @@ -90,9 +90,9 @@ inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size return _ReduceFunc(sum); } -template -inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) { +template +inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem); HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); @@ -161,10 +161,10 @@ inline HVX_Vector vec_add_qf16(HVX_Vector sum, HVX_Vector result) { return Q6_Vqf16_vadd_Vqf16Vqf16(sum, result); } -template -inline float vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { + _TRet (*_ReduceFunc)(HVX_Vector)> +inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1"); static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2, "Element size mismatch: _TElem1 must be twice the size of _TElem0"); @@ -260,10 +260,10 @@ inline float vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr return _ReduceFunc(sum); } -template -inline float vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { + _TRet (*_ReduceFunc)(HVX_Vector)> +inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1"); static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2, "Element size mismatch: _TElem1 must be twice the size of _TElem0"); @@ -322,32 +322,63 @@ inline float vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem namespace hexagon { +HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { + return vec_dot_product_impl(src0, src1, count); +} + +HVX_Vector vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { + return vec_dot_product_aligned_impl(src0, src1, + count); +} + float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_impl(src0, src1, count); + return vec_dot_product_impl(src0, src1, count); } float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_aligned_impl(src0, src1, count); + return vec_dot_product_aligned_impl(src0, src1, + count); +} + +HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + return vec_dot_product_impl( + src0, src1, count); +} + +HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count) { + return vec_dot_product_aligned_impl( + src0, src1, count); } float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_impl(src0, src1, - count); + return vec_dot_product_impl( + src0, src1, count); } float 
vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_aligned_impl( + return vec_dot_product_aligned_impl( src0, src1, count); } +HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + return vec_dot_product_mixed_impl(src0, src1, count); +} + +HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + return vec_dot_product_mix_aligned_impl(src0, src1, count); +} + float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mixed_impl(src0, src1, count); } float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mix_aligned_impl(src0, src1, count); + return vec_dot_product_mix_aligned_impl(src0, src1, count); } } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 5307bb756481d..aa484611ecc49 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -333,6 +333,9 @@ inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, s return true; } +HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count); +HVX_Vector vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count); + float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count); float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count); @@ -340,6 +343,10 @@ inline bool is_f32_f32_dot_product_aligned(const float * src0, const float * src return is_dot_product_aligned(src0, src1, count); } +HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); +HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count); + float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); @@ -348,6 +355,9 @@ inline bool is_f16_f16_dot_product_aligned(const npu_device_fp16_t * src0, const return is_dot_product_aligned(src0, src1, count); } +HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); +HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); + float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); From 7430cd39951b4c11993004e9fe7839ff119a6185 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 7 Jul 2025 14:14:35 +0800 Subject: [PATCH 22/53] Refactor matrix multiplication implementation to include vector conversion and performance tracking --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 47 ++++++++++++++++----- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index a61f7e4c548ae..1ca7e077d45a9 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -6,12 
+6,26 @@ namespace { -template struct get_data_type {}; +template struct get_data_type {}; -template -struct get_data_type { - using data_type0 = _TyData0; - using data_type1 = _TyData1; +template +struct get_data_type { + using data_type0 = _TData0; + using data_type1 = _TData1; +}; + +template struct convert_vector {}; + +template <> struct convert_vector { + static float convert(HVX_Vector vec) { return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec)); } +}; + +template <> struct convert_vector { + static float convert(HVX_Vector vec) { + HVX_Vector vect = Q6_Vhf_equals_Vqf16(vec); + uint16_t i = (vect[0] & 0xffff); + return reinterpret_cast<__fp16 &>(i); + } }; template @@ -145,17 +159,26 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso } // TODO: figure dst how to handle a entire row - dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + auto res0 = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 2, store); + dst_row[i0] = convert_vector::convert(res0); + } if (should_fetch_src0_row && i0 + 2 < (int64_t) actual_row_count) { hexagon::l2fetch_row(src0_row + src0_actual_row_size + src0_actual_row_size, valid_row0_bytes); } // TODO: figure dst how to handle a entire row - dst_row[i0 + 1] = - _DotFunc(reinterpret_cast(src0_row + src0_actual_row_size), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + auto res1 = _DotFunc(reinterpret_cast(src0_row + src0_actual_row_size), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 2, store); + dst_row[i0 + 1] = convert_vector::convert(res1); + } } if (ip + 1 < start_end_plane.second) { @@ -164,8 +187,10 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso if (i0 < (int64_t) actual_row_count) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; - dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + auto res = _DotFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 2, store); + dst_row[i0] = convert_vector::convert(res); } } } From 420f1f6bcb8c1db1e85a1c81ca5471f28a226446 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 7 Jul 2025 15:34:54 +0800 Subject: [PATCH 23/53] wip --- ggml/src/ggml-qnn/npu/device/util.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index 86da92b9a3130..4fdcc786bacc3 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -344,8 +344,10 @@ inline auto make_scoped_perf_timer(const char * format, ...) { } // namespace hexagon #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +# define _MAKE_VARIABLE_NAME2(name, postfix) name##postfix +# define _MAKE_VARIABLE_NAME(name, postfix) _MAKE_VARIABLE_NAME2(name, postfix) # define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ - auto __npu_timer_##__LINE__ = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__) + auto _MAKE_VARIABLE_NAME(__npu_timer_, __LINE__) = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__) #else # define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) 
((void) 0) #endif From 464ad0298ace7f6be52d8f96def2b6163dcd0fbf Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 7 Jul 2025 16:00:47 +0800 Subject: [PATCH 24/53] wip --- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 14 +++++++------- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 16 ++++++++-------- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 14 +++++++------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index f39f6c230bf38..ad6d3d9298af2 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -76,13 +76,13 @@ inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t dst_vec_ptr++; } - const size_t leftover_bytes = leftover * sizeof(_TyData); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? - *src0_vec_ptr : - prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + const size_t leftover_bytes = leftover * sizeof(_TyData); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? *src1_vec_ptr : @@ -278,10 +278,10 @@ void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { prev = curr; } - const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr = + const size_t leftover_bytes = leftover * sizeof(float); + HVX_Vector curr = (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; curr = Q6_V_valign_VVR(curr, prev, (size_t) src); sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index f16d8d60161de..cad6125550b54 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -71,13 +71,13 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size sum = _AddFunc(_MpyFunc(s0, s1), sum); } - const size_t leftover_bytes = leftover * sizeof(_TElem); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? - *src0_vec_ptr : - prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + const size_t leftover_bytes = leftover * sizeof(_TElem); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? *src1_vec_ptr : @@ -240,11 +240,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? 
Q6_V_hi_W(s0_pair) : Q6_V_lo_W(s0_pair), s1), sum); } - const size_t leftover0 = count % kElementsPerVector0; - const size_t leftover_bytes1 = leftover1 * sizeof(_TElem1); if (leftover1 > 0) { // handle the leftover elements - HVX_Vector curr0 = + const size_t leftover0 = count % kElementsPerVector0; + const size_t leftover_bytes1 = leftover1 * sizeof(_TElem1); + HVX_Vector curr0 = reinterpret_cast(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end ? *src0_vec_ptr : prev0; HVX_Vector curr1 = (leftover_bytes1 + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? *src1_vec_ptr : diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index aa484611ecc49..4209d253334b1 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -208,12 +208,11 @@ template 0) { // handle the leftover elements - HVX_Vector curr = + const size_t leftover_bytes = leftover * sizeof(_TParam); + HVX_Vector curr = (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; curr = Q6_V_valign_VVR(curr, prev, (size_t) src); q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _Func(curr, dst_vec_ptr, scale_vec)); From 8b763a9426245cfbac40a694e7a59ac59873effe Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 8 Jul 2025 12:05:51 +0800 Subject: [PATCH 25/53] wip --- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index cad6125550b54..4c32e9066fe62 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -1,7 +1,5 @@ #include "vec_ops.hpp" -#include "util.hpp" - namespace { template Date: Tue, 8 Jul 2025 12:22:13 +0800 Subject: [PATCH 26/53] create vec_ops.inl for more aggressive compiler inline --- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 149 ++++++++++-------- .../npu/device/{vec_ops.cpp => vec_ops.inl} | 109 +++++++------ 2 files changed, 134 insertions(+), 124 deletions(-) rename ggml/src/ggml-qnn/npu/device/{vec_ops.cpp => vec_ops.inl} (80%) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 4209d253334b1..62d52a6ba97b6 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -203,62 +203,6 @@ inline HVX_Vector hvx_scale_f32(float scale) { return Q6_V_vsplat_R(reinterpret_cast(scale)); } -template -inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size_t count) { - constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TParam); - - HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); - HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); - HVX_UVector * dst_vec_ptr = ((HVX_UVector *) dst); // TODO: opt the unaligned case? 
- HVX_Vector prev = *src_vec_ptr++; - const size_t leftover = count % kElementsPerVector; - - HVX_Vector scale_vec = _FuncScaleConvert(scale); - - while (src_vec_end - src_vec_ptr > 1) { - HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; - src_vec_ptr += 2; - - HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); - HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); - - dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); - dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); - - dst_vec_ptr += 2; - prev = Q6_V_hi_W(curr); - } - - if (src_vec_end - src_vec_ptr > 0) { - HVX_Vector curr = *src_vec_ptr++; - HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); - dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); - dst_vec_ptr++; - prev = curr; - } - - if ((src_vec_end - ((HVX_Vector *) src)) > 0) { - // handle the last vector - bool should_fetch_next = leftover == 0 && hexagon::is_addr_aligned(src_vec_ptr); - HVX_Vector curr = should_fetch_next ? prev : *src_vec_ptr; - src_vec_ptr = should_fetch_next ? src_vec_ptr : src_vec_ptr + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); - dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); - dst_vec_ptr++; - prev = curr; - } - - if (leftover > 0) { - // handle the leftover elements - const size_t leftover_bytes = leftover * sizeof(_TParam); - HVX_Vector curr = - (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; - curr = Q6_V_valign_VVR(curr, prev, (size_t) src); - q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _Func(curr, dst_vec_ptr, scale_vec)); - } -} - inline HVX_Vector hvx_vec_scale_f32_f32(HVX_Vector src, HVX_UVector *, HVX_Vector scale_vec) { return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(src, scale_vec)); } @@ -294,27 +238,39 @@ inline HVX_Vector hvx_passthru(HVX_Vector src, HVX_UVector *, HVX_Vector) { return src; } +} // namespace hexagon + +#include "vec_ops.inl" + +namespace hexagon { + inline void vec_scale_f32(const float * src, float scale, float * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, scale, dst, count); } inline void vec_mad_f32(const float * src, float scale, float * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, scale, dst, count); } inline void vec_cpy_f32(const float * src, float * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, 0, dst, count); } inline void vec_scale_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, scale, dst, count); } inline void vec_mad_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, scale, dst, count); } inline void vec_cpy_f16(const npu_device_fp16_t * src, npu_device_fp16_t * dst, size_t count) { + using namespace hexagon::vec; vec_scale_impl(src, 0, dst, count); } @@ -333,33 +289,88 @@ inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, s return true; } -HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count); -HVX_Vector vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count); +inline HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_impl(src0, src1, count); +} + +inline HVX_Vector 
vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_aligned_impl(src0, src1, + count); +} + +inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_impl(src0, src1, count); +} -float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count); -float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count); +inline float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_aligned_impl(src0, src1, + count); +} inline bool is_f32_f32_dot_product_aligned(const float * src0, const float * src1, size_t count) { return is_dot_product_aligned(src0, src1, count); } -HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); -HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, - size_t count); +inline HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count) { + using namespace hexagon::vec; + return vec_dot_product_impl( + src0, src1, count); +} -float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); -float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); +inline HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count) { + using namespace hexagon::vec; + return vec_dot_product_aligned_impl( + src0, src1, count); +} + +inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_impl( + src0, src1, count); +} + +inline float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count) { + using namespace hexagon::vec; + return vec_dot_product_aligned_impl( + src0, src1, count); +} inline bool is_f16_f16_dot_product_aligned(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { return is_dot_product_aligned(src0, src1, count); } -HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); -HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); +inline HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_mixed_impl(src0, src1, count); +} -float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); -float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); +inline HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, + size_t count) { + using namespace hexagon::vec; + return vec_dot_product_mix_aligned_impl(src0, src1, count); +} + +inline float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_mixed_impl(src0, src1, count); +} + +inline float 
vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + using namespace hexagon::vec; + return vec_dot_product_mix_aligned_impl(src0, src1, count); +} inline bool is_f16_f32_dot_product_aligned(const npu_device_fp16_t * src0, const float * src1, size_t count) { return is_dot_product_aligned(src0, src1, count); diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.inl similarity index 80% rename from ggml/src/ggml-qnn/npu/device/vec_ops.cpp rename to ggml/src/ggml-qnn/npu/device/vec_ops.inl index 4c32e9066fe62..fbaba70d822a8 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -1,6 +1,12 @@ -#include "vec_ops.hpp" +#pragma once -namespace { +#include + +#include + +#include "hexagon_npu.h" + +namespace hexagon::vec { template @@ -316,67 +322,60 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem return _ReduceFunc(_AddFunc(sum0, sum1)); } -} // namespace +template +inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TParam); -namespace hexagon { + HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); + HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); + HVX_UVector * dst_vec_ptr = ((HVX_UVector *) dst); // TODO: opt the unaligned case? + HVX_Vector prev = *src_vec_ptr++; + const size_t leftover = count % kElementsPerVector; -HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_impl(src0, src1, count); -} + HVX_Vector scale_vec = _FuncScaleConvert(scale); -HVX_Vector vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_aligned_impl(src0, src1, - count); -} + while (src_vec_end - src_vec_ptr > 1) { + HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; + src_vec_ptr += 2; -float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_impl(src0, src1, count); -} + HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); + HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); -float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_aligned_impl(src0, src1, - count); -} - -HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_impl( - src0, src1, count); -} - -HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, - size_t count) { - return vec_dot_product_aligned_impl( - src0, src1, count); -} - -float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_impl( - src0, src1, count); -} + dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); + dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); -float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_aligned_impl( - src0, src1, count); -} - -HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mixed_impl(src0, src1, count); -} + dst_vec_ptr += 2; + prev = 
Q6_V_hi_W(curr); + } -HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mix_aligned_impl(src0, src1, count); -} + if (src_vec_end - src_vec_ptr > 0) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); + dst_vec_ptr++; + prev = curr; + } -float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mixed_impl(src0, src1, count); -} + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool should_fetch_next = leftover == 0 && hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = should_fetch_next ? prev : *src_vec_ptr; + src_vec_ptr = should_fetch_next ? src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); + dst_vec_ptr++; + prev = curr; + } -float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { - return vec_dot_product_mix_aligned_impl(src0, src1, count); + if (leftover > 0) { + // handle the leftover elements + const size_t leftover_bytes = leftover * sizeof(_TParam); + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _Func(curr, dst_vec_ptr, scale_vec)); + } } -} // namespace hexagon +} // namespace hexagon::vec From a86df9e2488a3a21ef64df75506827be96a365e8 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 8 Jul 2025 12:33:00 +0800 Subject: [PATCH 27/53] wip --- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 83 +----------------------- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 81 +++++++++++++++++++++++ 2 files changed, 83 insertions(+), 81 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index ad6d3d9298af2..a794a8b750138 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -12,89 +12,9 @@ namespace { -template -inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { - constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); - - HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); - HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; - HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); - HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned - HVX_Vector prev0 = *src0_vec_ptr++; - HVX_Vector prev1 = *src1_vec_ptr++; - - { - while (src0_vec_ptr_end - src0_vec_ptr > 1) { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - - HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - prev0 = Q6_V_hi_W(curr0); - prev1 = Q6_V_hi_W(curr1); - src0_vec_ptr += 2; - src1_vec_ptr += 2; - - dst_vec_ptr[0] = _OpBinaryTransform(l0, l1); - dst_vec_ptr[1] = _OpBinaryTransform(h0, h1); - dst_vec_ptr += 2; - } - } 
- - if (src0_vec_ptr_end - src0_vec_ptr > 0) { - HVX_Vector curr0 = *src0_vec_ptr++; - HVX_Vector curr1 = *src1_vec_ptr++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; - - dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); - dst_vec_ptr++; - } - - const size_t leftover = count % kElementsPerVector; - if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { - // handle the last vector - // see also: - // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 - // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); - bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); - HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; - HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; - src0_vec_ptr += should_fetch_src0 ? 1 : 0; - src1_vec_ptr += should_fetch_src1 ? 1 : 0; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; - - dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); - dst_vec_ptr++; - } - - if (leftover > 0) { - // handle the leftover elements - const size_t leftover_bytes = leftover * sizeof(_TyData); - HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? - *src0_vec_ptr : - prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - - HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? 
- *src1_vec_ptr : - prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - - hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1)); - } -} - template inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { + using namespace hexagon::vec; vec_trans_op_impl<_OpBinaryTransform, float>(src0, src1, count, dst); } @@ -113,6 +33,7 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { template inline void vec_op_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count, npu_device_fp16_t * dst) { + using namespace hexagon::vec; vec_trans_op_impl<_OpBinaryTransform, npu_device_fp16_t>(src0, src1, count, dst); } diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index fbaba70d822a8..16d0a250e7420 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -378,4 +378,85 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size } } +template +inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); + + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + + { + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr += 2; + src1_vec_ptr += 2; + + dst_vec_ptr[0] = _OpBinaryTransform(l0, l1); + dst_vec_ptr[1] = _OpBinaryTransform(h0, h1); + dst_vec_ptr += 2; + } + } + + if (src0_vec_ptr_end - src0_vec_ptr > 0) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + + dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); + dst_vec_ptr++; + } + + const size_t leftover = count % kElementsPerVector; + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); + bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; + HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; + src0_vec_ptr += should_fetch_src0 ? 1 : 0; + src1_vec_ptr += should_fetch_src1 ? 
1 : 0; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + + dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); + dst_vec_ptr++; + } + + if (leftover > 0) { + // handle the leftover elements + const size_t leftover_bytes = leftover * sizeof(_TyData); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1)); + } +} + } // namespace hexagon::vec From ec953fad3250addd17b3a52f95b6b19541d643a1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 8 Jul 2025 22:57:54 +0800 Subject: [PATCH 28/53] refactor vector dot product implementations for improved readability and performance --- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 85 +++++++++++++----------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 16d0a250e7420..8479372082d31 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -30,15 +30,16 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - prev0 = Q6_V_hi_W(curr0); - prev1 = Q6_V_hi_W(curr1); + sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); src0_vec_ptr += 2; src1_vec_ptr += 2; - - sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); - sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); } sum = _AddFunc(sum0, sum1); @@ -112,16 +113,17 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr while (src0_vec_ptr_end - src0_vec_ptr > 3) { HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); + + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + src0_vec_ptr += 4; src1_vec_ptr += 4; - - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); } if (src0_vec_ptr_end - src0_vec_ptr > 1) { @@ -198,16 +200,18 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; HVX_Vector s0 = 
Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); - prev0 = curr0; - prev1 = Q6_V_hi_W(curr1); + + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); + + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), h1), sum1); + + prev0 = curr0; + prev1 = Q6_V_hi_W(curr1); src0_vec_ptr++; src1_vec_ptr += 2; - - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), h1), sum1); } sum = _AddFunc(sum0, sum1); @@ -291,18 +295,18 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem while (src1_vec_ptr_end - src1_vec_ptr > 3) { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - HVX_VectorPair curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); HVX_VectorPair curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + src0_vec_ptr += 2; src1_vec_ptr += 4; - - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); } sum0 = _AddFunc(sum0, sum2); @@ -310,11 +314,11 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem } if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - + HVX_Vector curr0 = src0_vec_ptr[0]; HVX_VectorPair s0_pair = _ExpandFunc(curr0, kOneV); + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), Q6_V_lo_W(curr1)), sum0); sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), Q6_V_hi_W(curr1)), sum1); } @@ -339,10 +343,10 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; src_vec_ptr += 2; - HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); - HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); - + HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); + + HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); dst_vec_ptr += 2; @@ -394,17 +398,18 @@ inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); - HVX_Vector l1 = 
Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - prev0 = Q6_V_hi_W(curr0); - prev1 = Q6_V_hi_W(curr1); - src0_vec_ptr += 2; - src1_vec_ptr += 2; - + HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); dst_vec_ptr[0] = _OpBinaryTransform(l0, l1); + + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); dst_vec_ptr[1] = _OpBinaryTransform(h0, h1); + + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr += 2; + src1_vec_ptr += 2; dst_vec_ptr += 2; } } From f16492d7a2c4af1197148b5163e7c8b929f6e30f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 8 Jul 2025 23:29:10 +0800 Subject: [PATCH 29/53] refactor vector conversion functions to use HVX_Vector_Dual for improved clarity and consistency --- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 13 +++--- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 52 ++++++++++++------------ 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 62d52a6ba97b6..41951a09b7a69 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -157,11 +157,14 @@ inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) { return qhmath_hvx_vqf32_convert_vqf16(qhmath_hvx_vqf16_convert_vhf(vxl)); } -inline HVX_VectorPair hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { - HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one); - HVX_Vector vxl_w = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)); - HVX_Vector vxh_w = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)); - return Q6_W_vcombine_VV(vxh_w, vxl_w); +using HVX_Vector_Dual = std::pair; + +inline HVX_Vector_Dual hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { + HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one); + return { + Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)), + Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)), + }; } inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 8479372082d31..0b96968e2bcb0 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -167,7 +167,7 @@ inline HVX_Vector vec_add_qf16(HVX_Vector sum, HVX_Vector result) { return Q6_Vqf16_vadd_Vqf16Vqf16(sum, result); } -template inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { @@ -199,14 +199,14 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector curr0 = src0_vec_ptr[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector_Dual s0_pair = _ExpandFunc(s0, kOneV); HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); + sum0 = _AddFunc(_MpyFunc(s0_pair.first, l1), sum0); HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), 
h1), sum1); + sum1 = _AddFunc(_MpyFunc(s0_pair.second, h1), sum1); prev0 = curr0; prev1 = Q6_V_hi_W(curr1); @@ -225,17 +225,15 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; src0_vec_ptr += should_fetch_src0 ? 1 : 0; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector_Dual s0_pair = _ExpandFunc(s0, kOneV); const bool has_remaining_src1_vector = src1_vec_ptr_end - src1_vec_ptr > 0; if (has_remaining_src1_vector) { HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = _AddFunc(_MpyFunc(s0_pair.first, s1), sum); prev1 = curr1; - - // should_handle_last_vector will be always true here - sum = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), s1), sum); } bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); @@ -245,7 +243,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr prev0 = curr0; prev1 = curr1; - sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? Q6_V_hi_W(s0_pair) : Q6_V_lo_W(s0_pair), s1), sum); + sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? s0_pair.second : s0_pair.first, s1), sum); } if (leftover1 > 0) { @@ -259,16 +257,16 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr prev1; curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - HVX_VectorPair curr0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector_Dual curr0_pair = _ExpandFunc(curr0, kOneV); - curr0 = leftover1 == leftover0 ? Q6_V_lo_W(curr0_pair) : Q6_V_hi_W(curr0_pair); + curr0 = leftover1 == leftover0 ? 
curr0_pair.first : curr0_pair.second; sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum); } return _ReduceFunc(sum); } -template inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { @@ -294,16 +292,16 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem HVX_Vector sum3 = Q6_V_vzero(); while (src1_vec_ptr_end - src1_vec_ptr > 3) { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_Vector_Dual curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + sum0 = _AddFunc(_MpyFunc(curr00.first, Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(curr00.second, Q6_V_hi_W(curr10)), sum1); - HVX_VectorPair curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + HVX_Vector_Dual curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + sum2 = _AddFunc(_MpyFunc(curr01.first, Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(curr01.second, Q6_V_hi_W(curr11)), sum3); src0_vec_ptr += 2; src1_vec_ptr += 4; @@ -314,13 +312,13 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem } if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_VectorPair s0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_Vector_Dual s0_pair = _ExpandFunc(curr0, kOneV); HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), Q6_V_hi_W(curr1)), sum1); + sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0); + sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1); } return _ReduceFunc(_AddFunc(sum0, sum1)); From 06627fb0fadc29debfafdf5b95044c4ac8bec1c9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 8 Jul 2025 23:53:10 +0800 Subject: [PATCH 30/53] wip --- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 40 ++++++++++++++---------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 0b96968e2bcb0..ecf30a456ce4a 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -252,11 +252,13 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr const size_t leftover_bytes1 = leftover1 * sizeof(_TElem1); HVX_Vector curr0 = reinterpret_cast(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end ? *src0_vec_ptr : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr1 = (leftover_bytes1 + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? 
*src1_vec_ptr : prev1; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector_Dual curr0_pair = _ExpandFunc(curr0, kOneV); curr0 = leftover1 == leftover0 ? curr0_pair.first : curr0_pair.second; @@ -316,9 +318,8 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem HVX_Vector_Dual s0_pair = _ExpandFunc(curr0, kOneV); HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - - sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1); + sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0); + sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1); } return _ReduceFunc(_AddFunc(sum0, sum1)); @@ -414,13 +415,15 @@ inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t if (src0_vec_ptr_end - src0_vec_ptr > 0) { HVX_Vector curr0 = *src0_vec_ptr++; - HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); + + prev0 = curr0; + prev1 = curr1; dst_vec_ptr++; } @@ -430,18 +433,21 @@ inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); - bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); - HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; - HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; - src0_vec_ptr += should_fetch_src0 ? 1 : 0; - src1_vec_ptr += should_fetch_src1 ? 1 : 0; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; + bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); + bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); + + HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); dst_vec_ptr[0] = _OpBinaryTransform(s0, s1); + + src0_vec_ptr += should_fetch_src0 ? 1 : 0; + src1_vec_ptr += should_fetch_src1 ? 
1 : 0; + prev0 = curr0; + prev1 = curr1; dst_vec_ptr++; } From b3e3e7ed97d539fda51529c2e38dd9d5701e157a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 10:25:28 +0800 Subject: [PATCH 31/53] wip --- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 29 ++++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 05a46b95fec5b..d32bebc3fc3a8 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -55,11 +55,12 @@ template inline HVX_Vector load_qual_block_generic(const _TBl const HVX_Vector * qs1 = qs0 + 1; HVX_Vector blocks = Q6_V_valign_VVR(*qs1, *qs0, (size_t) srcs->qs); HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); - HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); + + HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); + HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); + HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); - HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs); return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2)); } @@ -381,17 +382,21 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_target_type * d HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); qp0 = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); - q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); + + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); + + q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); + q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); + q_hi = Q6_Vhf_equals_Vqf16(q_hi); if constexpr (_IsDstAligned) { - reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); - reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + reinterpret_cast(dst_ptr)[0] = q_lo; + reinterpret_cast(dst_ptr)[1] = q_hi; } else { - reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); - reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + reinterpret_cast(dst_ptr)[0] = q_lo; + reinterpret_cast(dst_ptr)[1] = q_hi; } dst_ptr += hexagon::kBytesPerVector / sizeof(hexagon::dequant_target_type) * 2; From 5090f8e77b04dbaf56ae4307c7d8171afd0fbf55 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 11:43:54 +0800 Subject: [PATCH 32/53] wip --- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index d32bebc3fc3a8..781dc6176bdc9 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -417,11 +417,12 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_target_type * d qp0 = Q6_Wh_vunpack_Vb(q_lo); q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); if constexpr (_IsDstAligned) { - 
*reinterpret_cast(dst_ptr) = Q6_Vhf_equals_Vqf16(q_lo); + *reinterpret_cast(dst_ptr) = q_lo; } else { - *reinterpret_cast(dst_ptr) = Q6_Vhf_equals_Vqf16(q_lo); + *reinterpret_cast(dst_ptr) = q_lo; } dst_ptr += hexagon::kBytesPerVector / sizeof(hexagon::dequant_target_type); @@ -439,12 +440,12 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_target_type * d qp0 = Q6_Wh_vunpack_Vb(q_lo); q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); + if constexpr (_IsDstAligned) { - hexagon::q6op_vstu_variable_aligned(dst_ptr, Q6_Vhf_equals_Vqf16(q_lo)); + hexagon::q6op_vstu_variable_aligned(dst_ptr, q_lo); } else { - hexagon::q6op_vstu_variable_ARV( - dst_ptr, - Q6_Vhf_equals_Vqf16(q_lo)); // TODO: opt the store + hexagon::q6op_vstu_variable_ARV(dst_ptr, q_lo); } } } From 84e56c88f7c73500fde2dd4046e2d2f55bfcf698 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 12:41:27 +0800 Subject: [PATCH 33/53] implement row size caching logic and enhance type traits for F32 support --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 24 ++++++++++++++++---- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 9 ++++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 1ca7e077d45a9..d8f005a06d64e 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -199,6 +199,25 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso dst->release_write_buffer(); // mark the output tensor as modified } +bool is_row_size_cacheable(const npu_device_tensor_spec & src) { + const auto & type_traits = hexagon::get_type_traits(src.type); + if (type_traits.to_float == nullptr) { + DEVICE_LOG_DEBUG("[MUL_MAT]src.type(%s) cannot be cached, to_float is null\n", + hexagon::get_type_name(src.type)); + return false; + } + + const size_t type_size = type_traits.is_quantized ? 
sizeof(hexagon::dequant_target_type) : type_traits.type_size; + const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); + if (src.ne[0] * type_size > vtcm_thread_quota_size) { + DEVICE_LOG_DEBUG("[MUL_MAT]src.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", + hexagon::get_type_name(src.type), (long) src.ne[0], vtcm_thread_quota_size); + return false; + } + + return true; +} + bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) { if (src1.type != NPU_DATA_TYPE_F32 && src1.type != NPU_DATA_TYPE_F16) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", @@ -219,10 +238,7 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n return false; } - const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); - if (src0.ne[0] * sizeof(hexagon::dequant_target_type) > vtcm_thread_quota_size) { - DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", - hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); + if (!is_row_size_cacheable(src0)) { return false; } diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 781dc6176bdc9..619472a6ed774 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -495,7 +495,12 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_target_type * dst, s } void copy_row_f16(const void * src, hexagon::dequant_target_type * dst, size_t count) { - hexagon::vec_cpy_f16(reinterpret_cast(src), dst, count); + hexagon::vec_cpy_f16(reinterpret_cast(src), dst, count); +} + +void copy_row_f32(const void * src, hexagon::dequant_target_type * dst, size_t count) { + hexagon::vec_cpy_f16(reinterpret_cast(src), dst, + count * sizeof(float) / sizeof(npu_device_fp16_t)); } template struct dot_func_traits {}; @@ -515,7 +520,7 @@ _TReturn wrap_dot_func(const void * src0, const void * src1, size_t count) { } constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { - { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, nullptr, nullptr, + { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, copy_row_f32, nullptr, wrap_dot_func, wrap_dot_func, wrap_dot_func }, { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, copy_row_f16, quantize_row_fp16, From 56ad5f860b703d5b991a630c1940796a2c422738 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 13:35:59 +0800 Subject: [PATCH 34/53] refactor matrix multiplication functions to improve caching logic and simplify tensor alignment handling --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 56 ++++++++++++--------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index d8f005a06d64e..29280a0915650 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -34,9 +34,6 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - static_assert(!_ShouldCacheSrc0 || std::is_same_v, - "data_type0 must be the same as hexagon::dequant_target_type"); - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); auto * 
dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { @@ -291,25 +288,34 @@ bool is_mul_mat_f32_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::ten typedef void (*mul_mat_func_type)(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, hexagon::compute_params * params); +constexpr const mul_mat_func_type kMulMatF32F32CachedFuncs[2] = { + // quantized and non-quantized + mul_mat_impl, // F32 * F32 quantized unaligned + mul_mat_impl, // F32 * F32 quantized aligned +}; + +constexpr const mul_mat_func_type kMulMatF32F32Funcs[2] = { + // quantized and non-quantized + mul_mat_impl, // F32 * F32 quantized unaligned + mul_mat_impl, // F32 * F32 quantized aligned +}; + +constexpr const mul_mat_func_type kMulMatF16CachedFuncs[2] = { + mul_mat_impl, // F16 * F16 quantized unaligned + mul_mat_impl, // F16 * F16 quantized aligned +}; + +constexpr const mul_mat_func_type kMulMatF16Funcs[2] = { + mul_mat_impl, // F16 * F16 quantized unaligned + mul_mat_impl, // F16 * F16 quantized aligned +}; + constexpr const mul_mat_func_type kMulMatF16F32Funcs[2] = { // quantized and non-quantized mul_mat_impl, // F32 * F32 quantized unaligned mul_mat_impl, // F32 * F32 quantized aligned }; -constexpr const mul_mat_func_type kMulMatF16Funcs[2][2] = { - { - // non-quantized - mul_mat_impl, // F16 * F16 unaligned - mul_mat_impl, // F16 * F16 aligned - }, - { - // quantized - mul_mat_impl, // F16 * F16 quantized unaligned - mul_mat_impl, // F16 * F16 quantized aligned - }, -}; - } // namespace namespace hexagon { @@ -331,22 +337,26 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { } const bool is_src0_quantized = is_quantized_type(src0->get_type()); + const bool should_cache_src0 = is_src0_quantized || src1->get_ne(1) > 1; switch (src1->get_type()) { case NPU_DATA_TYPE_F32: if (is_src0_quantized || src0->get_type() == NPU_DATA_TYPE_F16) { kMulMatF16F32Funcs[is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, is_src0_quantized)](src0, src1, out, params); + } else if (should_cache_src0) { + kMulMatF32F32CachedFuncs[is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)](src0, src1, out, params); } else { - if (is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)) { - mul_mat_impl(src0, src1, out, params); - } else { - mul_mat_impl(src0, src1, out, params); - } + kMulMatF32F32Funcs[is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)](src0, src1, out, params); } return true; case NPU_DATA_TYPE_F16: - kMulMatF16Funcs[is_src0_quantized][is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized)]( - src0, src1, out, params); + if (should_cache_src0) { + kMulMatF16CachedFuncs[is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized)]( + src0, src1, out, params); + } else { + kMulMatF16Funcs[is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized)](src0, src1, out, + params); + } return true; default: break; From 973ce415981c6cb258623f788c70a2485d9e2970 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 23:36:00 +0800 Subject: [PATCH 35/53] add vector zeroing functions for F32 and F16 types to optimize memory initialization --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 11 +++++--- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 10 ++++++++ ggml/src/ggml-qnn/npu/device/vec_ops.inl | 25 +++++++++++++++++++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp 
index 1db48924504b7..1135d7298aebe 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -107,10 +107,13 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); hexagon::l2fetch_row(q_data, row_bytes_q); - if constexpr (is_v_f16) { - memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t)); - } else { - memset(VKQ32, 0, DV * sizeof(float)); + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, memset); + if constexpr (is_v_f16) { + hexagon::vec_zero_f16(VKQ16, DV); + } else { + hexagon::vec_zero_f32(VKQ32, DV); + } } const npu_device_fp16_t * mp = diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 41951a09b7a69..783da3319bd1d 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -262,6 +262,11 @@ inline void vec_cpy_f32(const float * src, float * dst, size_t count) { vec_scale_impl(src, 0, dst, count); } +inline void vec_zero_f32(float * src, size_t count) { + using namespace hexagon::vec; + vec_zero_impl(src, count); +} + inline void vec_scale_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { using namespace hexagon::vec; vec_scale_impl(src, scale, dst, count); @@ -277,6 +282,11 @@ inline void vec_cpy_f16(const npu_device_fp16_t * src, npu_device_fp16_t * dst, vec_scale_impl(src, 0, dst, count); } +inline void vec_zero_f16(npu_device_fp16_t * src, size_t count) { + using namespace hexagon::vec; + vec_zero_impl(src, count); +} + template inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, size_t count) { static_assert(sizeof(_TElem0) <= sizeof(_TElem1), "src0 should be smaller than src1"); diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index ecf30a456ce4a..bd60b569303d5 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -381,6 +381,31 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size } } +template inline void vec_zero_impl(_TData * src, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TData); + + HVX_UVector * src_vec_ptr = ((HVX_UVector *) src); + HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector); + + while (src_vec_end - src_vec_ptr > 1) { + src_vec_ptr[0] = Q6_V_vzero(); + src_vec_ptr[1] = Q6_V_vzero(); + src_vec_ptr += 2; + } + + if (src_vec_end - src_vec_ptr > 0) { + src_vec_ptr[0] = Q6_V_vzero(); + src_vec_ptr++; + } + + const size_t leftover = count % kElementsPerVector; + if (leftover > 0) { + // handle the leftover elements + const size_t leftover_bytes = leftover * sizeof(_TData); + q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, Q6_V_vzero()); + } +} + template inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); From 549c4fd1a9752d812f21eca64c3bc06e19a2c287 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 9 Jul 2025 23:36:39 +0800 Subject: [PATCH 36/53] Revert "add vector zeroing functions for F32 and F16 types to optimize memory initialization" This reverts commit e374326dc74d049e6603e393ade418d9ef2b83f3. 
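Only the op_flash_attn.cpp hunk is reverted here; the vec_zero_f32/vec_zero_f16
helpers added to vec_ops.hpp and vec_ops.inl in the previous commit are left in
place. For reference, a minimal usage sketch of the retained helpers (the buffer
names and sizes below are illustrative only, not taken from the flash-attention
code):

    #include "vec_ops.hpp"

    // zero two small accumulator buffers with HVX vector stores
    float acc_f32[256];
    npu_device_fp16_t acc_f16[256];
    hexagon::vec_zero_f32(acc_f32, 256);
    hexagon::vec_zero_f16(acc_f16, 256);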
--- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 1135d7298aebe..1db48924504b7 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -107,13 +107,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); hexagon::l2fetch_row(q_data, row_bytes_q); - { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, memset); - if constexpr (is_v_f16) { - hexagon::vec_zero_f16(VKQ16, DV); - } else { - hexagon::vec_zero_f32(VKQ32, DV); - } + if constexpr (is_v_f16) { + memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t)); + } else { + memset(VKQ32, 0, DV * sizeof(float)); } const npu_device_fp16_t * mp = From 0652b72a10d674fcd4074b70735f2ea5f43e0546 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 10 Jul 2025 10:48:20 +0800 Subject: [PATCH 37/53] wip --- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index bd60b569303d5..c22b20c9ce819 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -191,11 +191,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector prev1 = *src1_vec_ptr++; HVX_Vector sum = Q6_V_vzero(); - { + if (src1_vec_ptr_end - src1_vec_ptr > 1) { HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - while (src1_vec_ptr_end - src1_vec_ptr > 1) { + do { HVX_Vector curr0 = src0_vec_ptr[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; @@ -212,7 +212,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr prev1 = Q6_V_hi_W(curr1); src0_vec_ptr++; src1_vec_ptr += 2; - } + } while (src1_vec_ptr_end - src1_vec_ptr > 1); sum = _AddFunc(sum0, sum1); } @@ -289,11 +289,11 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - { + if (src1_vec_ptr_end - src1_vec_ptr > 3) { HVX_Vector sum2 = Q6_V_vzero(); HVX_Vector sum3 = Q6_V_vzero(); - while (src1_vec_ptr_end - src1_vec_ptr > 3) { + do { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; HVX_Vector_Dual curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; @@ -307,7 +307,7 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem src0_vec_ptr += 2; src1_vec_ptr += 4; - } + } while (src1_vec_ptr_end - src1_vec_ptr > 3); sum0 = _AddFunc(sum0, sum2); sum1 = _AddFunc(sum1, sum3); From 40d0632be1d03d89c6cafd6c8bb17870f1ae8277 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Jul 2025 00:48:41 +0800 Subject: [PATCH 38/53] refactor alignment checks in dot product function to handle null pointers --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 21 ++++++++++----------- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 29280a0915650..65d2cab488e49 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ 
b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -113,7 +113,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second; col_idx += src0_plane_slice_row_count) { - const auto actual_row_count = + const int64_t actual_row_count = std::min(src0_plane_slice_row_count, start_end_element.second - col_idx); // number of rows in this slice const uint8_t * src0_plane = @@ -122,14 +122,14 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso if (last_cached_plane_ptr != src0_plane) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant); - for (int64_t ir = 0; ir < (int64_t) actual_row_count; ir++) { + hexagon::l2fetch_row(src0_plane, src0->get_nb(1)); + for (int64_t ir = 0; ir < actual_row_count; ir++) { auto * src0_row = src0_plane + ir * src0->get_nb(1); if (ir + 1 < actual_row_count) { hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); } - auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + - ir * src0_actual_row_size); + auto * dst_row = src0_plane_cache_ptr + ir * src0_actual_row_size; dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0)); } @@ -149,7 +149,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso auto * src1_row = src1_plane + i1 * src1->get_nb(1); auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; int64_t i0 = 0; - for (; i0 + 1 < (int64_t) actual_row_count; i0 += 2) { + for (; i0 + 1 < actual_row_count; i0 += 2) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; if constexpr (should_fetch_src0_row) { hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row0_bytes); @@ -164,7 +164,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso dst_row[i0] = convert_vector::convert(res0); } - if (should_fetch_src0_row && i0 + 2 < (int64_t) actual_row_count) { + if (should_fetch_src0_row && i0 + 2 < actual_row_count) { hexagon::l2fetch_row(src0_row + src0_actual_row_size + src0_actual_row_size, valid_row0_bytes); } @@ -182,7 +182,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row1_bytes); } - if (i0 < (int64_t) actual_row_count) { + if (i0 < actual_row_count) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; auto res = _DotFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); @@ -246,9 +246,8 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n bool is_mul_mat_f16_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1, bool is_src0_quantized) { const auto * src1_ptr = src1->get_read_buffer_as(); - const auto * src0_ptr = is_src0_quantized ? - src1->get_read_buffer_as() : - src0->get_read_buffer_as(); // skip src0 for quantized tensors + const auto * src0_ptr = + is_src0_quantized ? 
nullptr : src0->get_read_buffer_as(); // skip src0 for quantized tensors if (!hexagon::is_f16_f32_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) { DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0)); @@ -261,7 +260,7 @@ bool is_mul_mat_f16_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::ten bool is_mul_mat_f16_f16_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1, bool is_src0_quantized) { const auto * src1_ptr = src1->get_read_buffer_as(); - const auto * src0_ptr = is_src0_quantized ? src1_ptr : src0->get_read_buffer_as(); + const auto * src0_ptr = is_src0_quantized ? nullptr : src0->get_read_buffer_as(); if (!hexagon::is_f16_f16_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) { DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0)); diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 783da3319bd1d..06b2a1261710a 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -291,7 +291,7 @@ template inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, size_t count) { static_assert(sizeof(_TElem0) <= sizeof(_TElem1), "src0 should be smaller than src1"); - if (!hexagon::is_addr_aligned(src0) || !hexagon::is_addr_aligned(src1)) { + if ((src0 && !hexagon::is_addr_aligned(src0)) || (src1 && !hexagon::is_addr_aligned(src1))) { return false; } From 009e058ec61cd8c37d81109902cce417f7249ed0 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Jul 2025 01:00:55 +0800 Subject: [PATCH 39/53] wip --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 65d2cab488e49..6c49376b988ea 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -93,7 +93,8 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso src0_plane_cache_size); const size_t valid_row0_bytes = src0->get_ne(0) * sizeof(data_type0); - const size_t valid_row1_bytes = src1->get_ne(0) * sizeof(data_type1); + const size_t valid_row1_bytes = + src0->get_ne(0) * sizeof(data_type1); // src0 and src1 should have the same element count in the 1st dimension DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(dst, params->get_thread_index(), mul_mat); uint8_t * dst_ptr = dst->get_write_buffer(); From 94eea19e939280285b899f0ab96434ed33a793a3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Jul 2025 13:32:07 +0800 Subject: [PATCH 40/53] refactor load_block_generic and related functions for improved alignment handling --- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 21 +++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 619472a6ed774..a82bc73a71ecb 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -31,19 +31,18 @@ inline npu_device_fp16_t to_fp16(const float src) { template inline HVX_Vector load_block_generic(const _TBlock & src) { static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); - const HVX_Vector * qs0 = reinterpret_cast(src.qs); - const HVX_Vector * qs1 = qs0 + 1; - return Q6_V_valign_VVR(*qs1, *qs0, (size_t) src.qs); + const 
HVX_Vector * qs0 = reinterpret_cast(src.qs); + HVX_Vector prev = *qs0; + HVX_Vector curr = hexagon::is_addr_aligned(src.qs) ? Q6_V_vzero() : *(qs0 + 1); + return Q6_V_valign_VVR(curr, prev, (size_t) src.qs); } template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) { static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - const HVX_Vector * qs0 = reinterpret_cast(srcs->qs); - const HVX_Vector * qs1 = qs0 + 1; - HVX_Vector blocks = Q6_V_valign_VVR(*qs1, *qs0, (size_t) srcs->qs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); + HVX_Vector blocks = load_block_generic(srcs[0]); + HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs)); } @@ -51,11 +50,9 @@ template inline HVX_Vector load_qual_block_generic(const _TBl static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - const HVX_Vector * qs0 = reinterpret_cast(srcs->qs); - const HVX_Vector * qs1 = qs0 + 1; - HVX_Vector blocks = Q6_V_valign_VVR(*qs1, *qs0, (size_t) srcs->qs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); + HVX_Vector blocks = load_block_generic(srcs[0]); + HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); From 3f6a48732da7e421d1c7d796597897cedd45cb66 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Jul 2025 16:15:11 +0800 Subject: [PATCH 41/53] wip --- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 22 +++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index a82bc73a71ecb..b6e600abdd944 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -28,20 +28,25 @@ inline npu_device_fp16_t to_fp16(const float src) { return reinterpret_cast(f16_value); } -template inline HVX_Vector load_block_generic(const _TBlock & src) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); +template inline HVX_Vector load_into_vector(const _TStruct * src) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load"); - const HVX_Vector * qs0 = reinterpret_cast(src.qs); + const HVX_Vector * qs0 = reinterpret_cast(&(src->*_MemberPtr)); HVX_Vector prev = *qs0; - HVX_Vector curr = hexagon::is_addr_aligned(src.qs) ? Q6_V_vzero() : *(qs0 + 1); - return Q6_V_valign_VVR(curr, prev, (size_t) src.qs); + HVX_Vector curr = hexagon::is_addr_aligned(qs0) ? 
Q6_V_vzero() : *(qs0 + 1); + return Q6_V_valign_VVR(curr, prev, (size_t) qs0); +} + +template inline HVX_Vector load_block_generic(const _TBlock & src) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); + return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src); } template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) { static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - HVX_Vector blocks = load_block_generic(srcs[0]); + HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs); HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs)); } @@ -50,7 +55,7 @@ template inline HVX_Vector load_qual_block_generic(const _TBl static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - HVX_Vector blocks = load_block_generic(srcs[0]); + HVX_Vector blocks = load_into_vector<_TBlock, 4, &_TBlock::qs>(srcs); HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); @@ -381,10 +386,11 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_target_type * d qp0 = Q6_Wh_vunpack_Vb(q_lo); q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); + q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); q_lo = Q6_Vhf_equals_Vqf16(q_lo); - q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); q_hi = Q6_Vhf_equals_Vqf16(q_hi); From dcf1580adba2fabdf1ff74e7d35d3e1cfd40891f Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 12 Jul 2025 23:24:21 +0800 Subject: [PATCH 42/53] refactor flash attention implementation and introduce type-erased dot function for improved type handling --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 15 ++++++----- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 26 +++++-------------- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 16 ++++++++++++ 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 1db48924504b7..5beea614a308c 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -18,8 +18,10 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const hexagon::tensor * v, const hexagon::tensor * mask, hexagon::compute_params * params) { static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count"); - if (k->get_type() != (_IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32) || v->get_type() != k->get_type()) { - DEVICE_LOG_ERROR("flash_attn_impl: k and v must be F16 type, got k: %s, v: %s\n", + constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? 
NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32; + + if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) { + DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n", hexagon::get_type_name(k->get_type()), hexagon::get_type_name(v->get_type())); return; } @@ -44,10 +46,11 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const float m0 = powf(2.0f, -(max_bias) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const auto & k_type_traits = hexagon::get_type_traits(k->get_type()); - const auto q_to_vec_dot = k_type_traits.from_float; // TODO: fix this - const auto kq_vec_dot = k_type_traits.vec_dot; - if (!q_to_vec_dot || !kq_vec_dot) { + const auto & k_type_traits = hexagon::get_type_traits(kKvDataType); + const auto q_to_vec_dot = k_type_traits.from_float; + constexpr const auto kq_vec_dot = _IsKvF16 ? hexagon::type_erase_dot_func : + hexagon::type_erase_dot_func; + if (!q_to_vec_dot) { DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n"); return; } diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index b6e600abdd944..0fc07cbb7153b 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -506,29 +506,15 @@ void copy_row_f32(const void * src, hexagon::dequant_target_type * dst, size_t c count * sizeof(float) / sizeof(npu_device_fp16_t)); } -template struct dot_func_traits {}; - -template struct dot_func_traits<_TReturn (*)(_TData, _TData, size_t)> { - using param_type = std::remove_const_t>; - using return_type = _TReturn; -}; - -template ::return_type> -_TReturn wrap_dot_func(const void * src0, const void * src1, size_t count) { - using param_type = typename dot_func_traits::param_type; - - auto * src0_typed = reinterpret_cast(src0); - auto * src1_typed = reinterpret_cast(src1); - return _DotFunc(src0_typed, src1_typed, count); -} - constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, copy_row_f32, nullptr, - wrap_dot_func, wrap_dot_func, - wrap_dot_func }, + hexagon::type_erase_dot_func, + hexagon::type_erase_dot_func, + hexagon::type_erase_dot_func }, { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, copy_row_f16, quantize_row_fp16, - wrap_dot_func, wrap_dot_func, - wrap_dot_func }, + hexagon::type_erase_dot_func, + hexagon::type_erase_dot_func, + hexagon::type_erase_dot_func }, { NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false }, { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0, quantize_row_q8_0 }, diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 06b2a1261710a..2da30cd0671c8 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -389,4 +389,20 @@ inline bool is_f16_f32_dot_product_aligned(const npu_device_fp16_t * src0, const return is_dot_product_aligned(src0, src1, count); } +template struct dot_func_traits {}; + +template struct dot_func_traits<_TReturn (*)(_TData, _TData, size_t)> { + using param_type = std::remove_const_t>; + using return_type = _TReturn; +}; + +template ::return_type> +_TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count) { + using param_type = typename dot_func_traits::param_type; + + auto * src0_typed = reinterpret_cast(src0); + auto * src1_typed = 
reinterpret_cast(src1); + return _DotFunc(src0_typed, src1_typed, count); +} + } // namespace hexagon From 97a567888126171b75f6971ab67191ea1621382a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 14 Jul 2025 12:19:15 +0800 Subject: [PATCH 43/53] refactor dot product implementations for improved loop handling and clarity --- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 47 +++++++++++++----------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index c22b20c9ce819..13110cb037454 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -20,11 +20,11 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size HVX_Vector prev1 = *src1_vec_ptr++; HVX_Vector sum = Q6_V_vzero(); - { + if (src0_vec_ptr_end - src0_vec_ptr > 1) { HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - while (src0_vec_ptr_end - src0_vec_ptr > 1) { + do { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; @@ -40,7 +40,7 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size prev1 = Q6_V_hi_W(curr1); src0_vec_ptr += 2; src1_vec_ptr += 2; - } + } while (src0_vec_ptr_end - src0_vec_ptr > 1); sum = _AddFunc(sum0, sum1); } @@ -108,22 +108,27 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr { HVX_Vector sum0 = Q6_V_vzero(); HVX_Vector sum1 = Q6_V_vzero(); - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); - - while (src0_vec_ptr_end - src0_vec_ptr > 3) { - HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - - HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); - - src0_vec_ptr += 4; - src1_vec_ptr += 4; + if (src0_vec_ptr_end - src0_vec_ptr > 3) { + HVX_Vector sum2 = Q6_V_vzero(); + HVX_Vector sum3 = Q6_V_vzero(); + + do { + HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); + + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + + src0_vec_ptr += 4; + src1_vec_ptr += 4; + } while (src0_vec_ptr_end - src0_vec_ptr > 3); + + sum0 = _AddFunc(sum2, sum0); + sum1 = _AddFunc(sum3, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 1) { @@ -136,9 +141,7 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); } - sum0 = _AddFunc(sum2, sum0); - sum1 = _AddFunc(sum3, sum1); - sum = _AddFunc(sum0, sum1); + sum = _AddFunc(sum0, sum1); } if (src0_vec_ptr_end - src0_vec_ptr > 0) { From 00cdd3fa88d909feef44ddaa42095274b7627685 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 
16 Jul 2025 22:58:27 +0800 Subject: [PATCH 44/53] refactor thread_pool constructor to pre-allocate VTCM cache for each thread --- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 455d4eec301b5..218afc10f2e01 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -127,9 +127,11 @@ template class thread_pool { thread_pool() { for (size_t i = 0; i < kMaxThreadCount; ++i) { - _thread_params[i].tidx = i; - _thread_params[i].vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount; - _thread_params[i].pool = this; + auto & param = _thread_params[i]; + param.tidx = i; + param.vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount; + param.pool = this; + param.get_vtcm_cache(param.vtcm_quota_size); // pre-allocate VTCM cache for each thread } qurt_barrier_init(&_pending, kMaxSubThreadCount + 1); From 93fbaad19b24b783b0c0bcd20a25b5d53d5c3c81 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 16 Jul 2025 23:55:46 +0800 Subject: [PATCH 45/53] Revert "refactor thread_pool constructor to pre-allocate VTCM cache for each thread" This reverts commit 00cdd3fa88d909feef44ddaa42095274b7627685. --- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 218afc10f2e01..455d4eec301b5 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -127,11 +127,9 @@ template class thread_pool { thread_pool() { for (size_t i = 0; i < kMaxThreadCount; ++i) { - auto & param = _thread_params[i]; - param.tidx = i; - param.vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount; - param.pool = this; - param.get_vtcm_cache(param.vtcm_quota_size); // pre-allocate VTCM cache for each thread + _thread_params[i].tidx = i; + _thread_params[i].vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount; + _thread_params[i].pool = this; } qurt_barrier_init(&_pending, kMaxSubThreadCount + 1); From e0f795b9d46e8ccd63b857f3cc2c92f0338fbcc9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 17 Jul 2025 20:40:18 +0800 Subject: [PATCH 46/53] wip --- ggml/src/ggml-qnn/npu/host/buffer.cpp | 3 +++ ggml/src/ggml-qnn/npu/host/graph.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index c7482f8b590e6..a0305e502293c 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -3,6 +3,7 @@ #include #include "host_device.hpp" +#include "profiler.hpp" #include "tensor.hpp" namespace { @@ -78,6 +79,8 @@ void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { void backend_buffer_reset(ggml_backend_buffer_t buffer) { auto * buffer_obj = get_buffer_object(buffer); GGML_ASSERT(buffer_obj != nullptr); + + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_reset", (void *) buffer_obj); buffer_obj->clear_tensors(); } diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 1d40fe0dd5176..526191173dd17 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -57,10 +57,10 @@ bool 
host_graph::update(ggml_cgraph * cgraph) { _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node)); - PROFILER_LOG_DEBUG("node[%d]%s(%s), addr(%p), %s_%ldx%ldx%ldx%ld, handle(%p)\n", i, ggml_get_name(node), - ggml_op_desc(node), (void *) tensor_obj, ggml_type_name(node->type), - (long) tensor_obj->get_ne(0), (long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), - (long) tensor_obj->get_ne(3), (void *) tensor_obj->get_device_tensor_handle()); + PROFILER_LOG_DEBUG("node[%d]%s(%s), addr(%p), %ldx%ldx%ldx%ld%s, handle(%p)\n", i, ggml_get_name(node), + ggml_op_desc(node), (void *) tensor_obj, (long) tensor_obj->get_ne(0), + (long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), (long) tensor_obj->get_ne(3), + ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle()); } GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size()); From 68cf1cae1f99bbb5ba59965f1906cb2cd5c075cd Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 17 Jul 2025 21:09:26 +0800 Subject: [PATCH 47/53] opt interfaces for tensor cleanup --- ggml/src/ggml-qnn/npu/device/device.cpp | 20 ++++++++++++++++++++ ggml/src/ggml-qnn/npu/host/buffer.cpp | 2 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 23 +++++++++++++++++++++++ ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 4 ++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index ff2819bae65e5..bfdd34b89ec11 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -177,6 +177,26 @@ AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t return AEE_SUCCESS; } +AEEResult npu_device_tensors_free(remote_handle64 _h, const npu_device_tensor_handle_t * tensor_handles, + int tensor_handlesLen) { + NPU_UNUSED(_h); + if (!tensor_handles || tensor_handlesLen < 0) { + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments"); + return AEE_EINVARGS; + } + + for (int i = 0; i < tensor_handlesLen; ++i) { + auto * tensor = tensor_from_handle(tensor_handles[i]); + if (tensor) { + delete tensor; + } else { + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i); + } + } + + return AEE_SUCCESS; +} + AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) { NPU_UNUSED(_h); auto * graph = new (std::nothrow) hexagon::graph(); diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index a0305e502293c..3eeb611f1d712 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -202,8 +202,8 @@ std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remo } void host_buffer::clear_tensors() { - _tensors.clear(); LOG_DEBUG("clear host_buffer(%p) tensors\n", (void *) _data); + host_tensor::destroy_tensors(_tensors); } host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) : diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index 7e8ee8f34cc09..6e75c7f88f892 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -1,6 +1,8 @@ #pragma once +#include #include +#include #include "common.hpp" #include "ggml-impl.h" @@ -66,6 +68,27 @@ class host_tensor { } } + static void destroy_tensors(std::list> & tensors) { + std::vector handles; + + 
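PATCH 46/47 context: clear_tensors() used to drop the host-side objects one by one, each host_tensor presumably releasing its device handle through its own remote call; the new npu_device_tensors_free entry point lets host_tensor::destroy_tensors hand the whole handle list to the device in a single call. A sketch of that batching pattern, with stand-in names (device_tensors_free_sketch is not the real RPC stub):

#include <cstdint>
#include <list>
#include <memory>
#include <vector>

// Sketch of the batched-release pattern introduced in PATCH 47: collect every
// live device handle, invalidate it on the host object first so a later
// per-tensor destructor cannot free it again, then release the batch in one
// remote call.
using handle_sketch_t = uint64_t;

struct tensor_sketch {
    handle_sketch_t device_handle = 0;
};

void device_tensors_free_sketch(const handle_sketch_t * /*handles*/, int /*count*/) {
    // one FastRPC round trip in the real code; a no-op in this sketch
}

void destroy_tensors_sketch(std::list<std::shared_ptr<tensor_sketch>> & tensors) {
    std::vector<handle_sketch_t> handles;
    handles.reserve(tensors.size());

    for (auto & t : tensors) {
        if (t && t->device_handle != 0) {
            handles.push_back(t->device_handle);
            t->device_handle = 0;  // mark as released to prevent a double free
        }
    }

    if (!handles.empty()) {
        device_tensors_free_sketch(handles.data(), static_cast<int>(handles.size()));
    }
    tensors.clear();
}

Collapsing N remote calls into one is presumably the cost that the SCOPED_PERFORMANCE_TRACKER added to backend_buffer_reset in PATCH 46 was put in place to measure.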
handles.reserve(tensors.size()); + remote_handle64 device_handle = 0; + + for (auto tensor : tensors) { + if (tensor) { + handles.push_back(tensor->_device_tensor_handle); + tensor->_device_tensor_handle = 0; // prevent double free + device_handle = tensor->_device_handle; + } + } + + if (!handles.empty()) { + npu_device_tensors_free(device_handle, handles.data(), handles.size()); + } + + tensors.clear(); + } + npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; } void update_params(ggml_tensor * ggml_tensor) { diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index 0aa8d8e8ab48b..3a54561d71091 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -107,6 +107,10 @@ interface npu_device : remote_handle64{ in tensor_handle_t tensor_handle ); + AEEResult tensors_free( + in sequence tensor_handles + ); + AEEResult graph_init( rout graph_handle_t graph_handle ); From b5d316c61a984866a134f9237581fe2cfb94ecea Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 12:06:48 +0800 Subject: [PATCH 48/53] refactor mul_mat_impl to use aligned size for src0 row calculation --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 6c49376b988ea..22fdbaced73ed 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -34,7 +34,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + const auto src0_actual_row_size = hexagon::get_aligned_size(hexagon::get_dequantized_row_size(src0)); auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); From ef4550f8da4d3e67202d24b75d240bceef94e1ad Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 17:18:26 +0800 Subject: [PATCH 49/53] refactor: update dequantized_row_size logic and add size alignment checks for tensors --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 22 +++++++++++++++++++- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 9 ++++++++ ggml/src/ggml-qnn/npu/device/type_traits.hpp | 9 +------- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 4 ++++ 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 22fdbaced73ed..f7e6b6413ad67 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -34,7 +34,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_aligned_size(hexagon::get_dequantized_row_size(src0)); + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized 
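PATCH 48/49 context: the dequantized-row cache stride goes through get_aligned_size(), and the f32/f16 fast paths now refuse tensors whose nb[1] is not a multiple of the vector size. Both helpers reduce to the usual power-of-two rounding; a sketch under the assumption of a 128-byte HVX vector, with illustrative constants rather than the project's own:

#include <cassert>
#include <cstddef>

// Sketch of the alignment arithmetic behind get_aligned_size()/is_size_aligned(),
// assuming a 128-byte HVX vector length.
constexpr size_t kVectorBytesSketch = 128;
constexpr size_t kAlignMaskSketch   = kVectorBytesSketch - 1;

constexpr size_t aligned_size_sketch(size_t size) {
    return (size + kAlignMaskSketch) & ~kAlignMaskSketch;  // round up to the next vector boundary
}

constexpr bool size_aligned_sketch(size_t size) {
    return (size & kAlignMaskSketch) == 0;  // true for a whole number of vectors
}

int main() {
    // A dequantized row of 100 f32 values occupies 400 bytes but is padded to 512,
    // so each cached row starts on a vector boundary.
    assert(aligned_size_sketch(100 * sizeof(float)) == 512);
    assert(size_aligned_sketch(512) && !size_aligned_sketch(400));
    return 0;
}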
src0 type: %d, dequantize_row_func is null\n", src0->get_type()); @@ -268,6 +268,16 @@ bool is_mul_mat_f16_f16_src_tensors_aligned(hexagon::tensor * src0, hexagon::ten return false; } + if (!is_src0_quantized && !hexagon::is_size_aligned(src0->get_nb(1))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0 tensor nb[1] is not aligned: %zu\n", src0->get_nb(1)); + return false; + } + + if (!hexagon::is_size_aligned(src1->get_nb(1))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src1 tensor nb[1] is not aligned: %zu\n", src1->get_nb(1)); + return false; + } + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0)); return true; } @@ -281,6 +291,16 @@ bool is_mul_mat_f32_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::ten return false; } + if (!hexagon::is_size_aligned(src0->get_nb(1))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0 tensor nb[1] is not aligned: %zu\n", src0->get_nb(1)); + return false; + } + + if (!hexagon::is_size_aligned(src1->get_nb(1))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src1 tensor nb[1] is not aligned: %zu\n", src1->get_nb(1)); + return false; + } + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0)); return true; } diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 0fc07cbb7153b..23c4831f00477 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -560,4 +560,13 @@ const device_type_traits & get_type_traits(npu_device_tensor_data_type type) { return kDeviceTypeTraits[type]; } +size_t get_dequantized_row_size(const tensor * tensor) { + if (!is_quantized_type(tensor->get_type())) { + return tensor->get_nb(1); // for f32 and f16 + } + + auto row_elems_count = tensor->get_ne(0); + return hexagon::get_aligned_size(row_elems_count * sizeof(dequant_target_type)); // currently only f32 is supported +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp index 224fffdc4d3c0..645101a676c60 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -34,14 +34,7 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) { return get_type_traits(type).is_quantized; } -inline size_t get_dequantized_row_size(const tensor * tensor) { - if (!is_quantized_type(tensor->get_type())) { - return tensor->get_nb(1); // for f32 and f16 - } - - auto row_elems_count = tensor->get_ne(0); - return row_elems_count * sizeof(dequant_target_type); // currently only f32 is supported -} +size_t get_dequantized_row_size(const tensor * tensor); inline const char * get_type_name(npu_device_tensor_data_type type) { return get_type_traits(type).type_name; diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 2da30cd0671c8..051255c9b76a7 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -31,6 +31,10 @@ inline bool is_addr_aligned(const void * addr) { return unaligned_bytes(addr) == 0; } +inline bool is_size_aligned(size_t size) { + return (size & kAlignMask) == 0; +} + inline float get_flt0_from_fltv(HVX_Vector vect) { static_assert(sizeof(vect[0]) == sizeof(float), "vect[0] should be a float"); int32_t i = vect[0]; From f7ab2db3d91f20239f93ac047a82a208281eaa08 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 17:25:13 +0800 Subject: [PATCH 50/53] wip --- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 3 +-- 1 file 
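PATCH 50, whose hunk follows, fixes copy_row_f32: it had been routing f32 rows through the f16 copy helper with a converted element count, and now calls an f32 helper with the element count unchanged. Both helpers take an element count rather than a byte count, which is the contract the old conversion had to work around. A memcpy-based sketch of that contract (names hypothetical, not the HVX implementations):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch of the copy-helper contract touched by the hunk below: both helpers
// take an *element* count, so pushing f32 data through the f16 helper needs a
// unit conversion, which the patch removes by using an f32 helper directly.
using fp16_sketch_t = uint16_t;

inline void vec_cpy_f16_sketch(const fp16_sketch_t * src, fp16_sketch_t * dst, size_t count) {
    std::memcpy(dst, src, count * sizeof(fp16_sketch_t));
}

inline void vec_cpy_f32_sketch(const float * src, float * dst, size_t count) {
    std::memcpy(dst, src, count * sizeof(float));
}

inline void copy_row_f32_sketch(const float * src, float * dst, size_t count) {
    vec_cpy_f32_sketch(src, dst, count);  // element count passes through unchanged
    // The pre-patch shape, for comparison, moved the same bytes as 2*count fp16 values:
    //   vec_cpy_f16_sketch(reinterpret_cast<const fp16_sketch_t *>(src),
    //                      reinterpret_cast<fp16_sketch_t *>(dst),
    //                      count * sizeof(float) / sizeof(fp16_sketch_t));
}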
changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 23c4831f00477..debdd36a670ab 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -502,8 +502,7 @@ void copy_row_f16(const void * src, hexagon::dequant_target_type * dst, size_t c } void copy_row_f32(const void * src, hexagon::dequant_target_type * dst, size_t count) { - hexagon::vec_cpy_f16(reinterpret_cast(src), dst, - count * sizeof(float) / sizeof(npu_device_fp16_t)); + hexagon::vec_cpy_f32(reinterpret_cast(src), reinterpret_cast(dst), count); } constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { From bcb0e7938e94e7bb20323a793d24784eca970741 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 18:49:43 +0800 Subject: [PATCH 51/53] wip --- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 4 ++-- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index f7e6b6413ad67..e7ca2ea4404c5 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -130,8 +130,8 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); } - auto * dst_row = src0_plane_cache_ptr + ir * src0_actual_row_size; - dequantize_row_func(src0_row, reinterpret_cast(dst_row), + auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size; + dequantize_row_func(src0_row, reinterpret_cast(cached_row_ptr), src0->get_ne(0)); } diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index debdd36a670ab..31377f6e55219 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -565,7 +565,8 @@ size_t get_dequantized_row_size(const tensor * tensor) { } auto row_elems_count = tensor->get_ne(0); - return hexagon::get_aligned_size(row_elems_count * sizeof(dequant_target_type)); // currently only f32 is supported + return hexagon::get_aligned_size( + row_elems_count * sizeof(dequant_target_type)); // dequant_target_type is currently restricted to f32 } } // namespace hexagon From 9298936c90ee93463d5f1b9ef194507efa546580 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 19:04:27 +0800 Subject: [PATCH 52/53] refactor: replace raw pointer initialization with invalid handle constants for better clarity --- ggml/src/ggml-qnn/npu/device/device.cpp | 41 ++++++++++------------- ggml/src/ggml-qnn/npu/host/graph.hpp | 2 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 6 ++-- ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 3 ++ 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index bfdd34b89ec11..db987217fa4c5 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -4,7 +4,6 @@ #include #include -#include #include "graph.hpp" #include "hexagon_npu.h" @@ -69,20 +68,28 @@ struct npu_device_context { } }; -inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) { +inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) { + if (h == npu_device_INVALID_DEVICE_TENSOR_HANDLE) { + return nullptr; + } + return reinterpret_cast(h); } -inline 
npu_device_graph_handle_t tensor_to_handle(hexagon::tensor * tensor) { - return reinterpret_cast(tensor); +inline npu_device_tensor_handle_t tensor_to_handle(hexagon::tensor * tensor) { + return reinterpret_cast(tensor); } -inline hexagon::graph * graph_from_handle(npu_device_tensor_handle_t h) { +inline hexagon::graph * graph_from_handle(npu_device_graph_handle_t h) { + if (h == npu_device_INVALID_DEVICE_GRAPH_HANDLE) { + return nullptr; + } + return reinterpret_cast(h); } -inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) { - return reinterpret_cast(graph); +inline npu_device_graph_handle_t graph_to_handle(hexagon::graph * graph) { + return reinterpret_cast(graph); } inline npu_device_context * device_context_from_handle(remote_handle64 h) { @@ -93,12 +100,7 @@ inline npu_device_context * device_context_from_handle(remote_handle64 h) { int npu_device_open(const char * uri, remote_handle64 * h) { // TODO: should we have a device context here? - auto * context = new (std::nothrow) npu_device_context(); - if (!context) { - DEVICE_LOG_ERROR("Failed to allocate memory for the npu_device_context"); - return AEE_ENOMEMORY; - } - + auto * context = new npu_device_context(); if (!context->init()) { DEVICE_LOG_ERROR("Failed to initialize npu_device_context"); delete context; @@ -144,12 +146,7 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, npu_device_tensor_op AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info, npu_device_tensor_handle_t * tensor_handle) { NPU_UNUSED(_h); - auto * tensor = new (std::nothrow) hexagon::tensor(*info); - if (!tensor) { - DEVICE_LOG_ERROR("Failed to allocate memory for the tensor"); - return AEE_ENOMEMORY; - } - + auto * tensor = new hexagon::tensor(*info); *tensor_handle = tensor_to_handle(tensor); return AEE_SUCCESS; } @@ -199,11 +196,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h, const npu_device_tensor_ha AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) { NPU_UNUSED(_h); - auto * graph = new (std::nothrow) hexagon::graph(); - if (!graph) { - return AEE_ENOMEMORY; - } - + auto * graph = new hexagon::graph(); *graph_handle = graph_to_handle(graph); return AEE_SUCCESS; } diff --git a/ggml/src/ggml-qnn/npu/host/graph.hpp b/ggml/src/ggml-qnn/npu/host/graph.hpp index b871c125563f2..0f8efe1079785 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.hpp +++ b/ggml/src/ggml-qnn/npu/host/graph.hpp @@ -22,7 +22,7 @@ class host_graph { private: remote_handle64 _device_handle = 0; - npu_device_graph_handle_t _graph_handle = 0; + npu_device_graph_handle_t _graph_handle = npu_device_INVALID_DEVICE_GRAPH_HANDLE; std::vector _tensor_handles; std::vector _tensor_update_configs; diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index 6e75c7f88f892..f4b1b70a05c89 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -44,7 +44,7 @@ class host_tensor { auto status = npu_device_tensor_init(_device_handle, &_info, &_device_tensor_handle); if (status != AEE_SUCCESS) { LOG_ERROR("Failed to init tensor: %d", (int) status); - _device_tensor_handle = 0; + _device_tensor_handle = npu_device_INVALID_DEVICE_TENSOR_HANDLE; return; } @@ -77,7 +77,7 @@ class host_tensor { for (auto tensor : tensors) { if (tensor) { handles.push_back(tensor->_device_tensor_handle); - tensor->_device_tensor_handle = 0; // prevent double free + tensor->_device_tensor_handle = 
npu_device_INVALID_DEVICE_TENSOR_HANDLE; // prevent double free device_handle = tensor->_device_handle; } } @@ -180,7 +180,7 @@ class host_tensor { return _info_update; } - bool is_valid() const { return _device_tensor_handle != 0; } + bool is_valid() const { return _device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE; } int64_t get_ne(size_t index) const { if (index >= DEVICE_TENSOR_MAX_DIMS) { diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index 3a54561d71091..513b69d88a25b 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -20,6 +20,9 @@ interface npu_device : remote_handle64{ typedef uint64_t tensor_handle_t; typedef uint64_t graph_handle_t; + const graph_handle_t INVALID_DEVICE_GRAPH_HANDLE = 0; + const tensor_handle_t INVALID_DEVICE_TENSOR_HANDLE = 0; + typedef uint16_t fp16_t; struct block_q4_0 { From a7ea14571e89957ac0dc75b5751d3b6d58c49610 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 18 Jul 2025 19:15:05 +0800 Subject: [PATCH 53/53] wip --- ggml/src/ggml-qnn/npu/device/tensor.hpp | 3 ++- ggml/src/ggml-qnn/npu/device/vec_ops.inl | 2 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index 3bf834f826f4c..c6a7fb10779dc 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -60,7 +60,8 @@ class tensor { memcpy(_op_params, config.params, sizeof(_op_params)); for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) { auto src_handle = config.src_handles[i]; - _src[i] = (src_handle ? reinterpret_cast(src_handle) : nullptr); + _src[i] = (src_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE ? reinterpret_cast(src_handle) : + nullptr); } } diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 13110cb037454..f21d6b06d9e46 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -492,7 +492,7 @@ inline void vec_trans_op_impl(const _TyData * src0, const _TyData * src1, size_t prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - hexagon::q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1)); + q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1)); } } diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index f4b1b70a05c89..f70526bf25dff 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -75,7 +75,7 @@ class host_tensor { remote_handle64 device_handle = 0; for (auto tensor : tensors) { - if (tensor) { + if (tensor && tensor->_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) { handles.push_back(tensor->_device_tensor_handle); tensor->_device_tensor_handle = npu_device_INVALID_DEVICE_TENSOR_HANDLE; // prevent double free device_handle = tensor->_device_handle;
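PATCH 52/53 close the series by replacing the scattered literal zeros with the IDL constants npu_device_INVALID_DEVICE_TENSOR_HANDLE and npu_device_INVALID_DEVICE_GRAPH_HANDLE, checking for them before any reinterpret_cast, and correcting the mismatched handle types on tensor_from_handle / graph_from_handle. A minimal sketch of the sentinel-checked conversion, with hypothetical names standing in for hexagon::tensor and the IDL constant:

#include <cstdint>

// Minimal sketch of the sentinel-checked handle <-> pointer mapping adopted in
// PATCH 52/53: a shared constant replaces literal zeros, and the from-handle
// helper refuses to reinterpret the sentinel as an object pointer.
using tensor_handle_sketch_t = uint64_t;
constexpr tensor_handle_sketch_t kInvalidTensorHandleSketch = 0;

struct device_tensor_sketch {};

inline device_tensor_sketch * from_handle_sketch(tensor_handle_sketch_t h) {
    if (h == kInvalidTensorHandleSketch) {
        return nullptr;  // never dereference or reinterpret the sentinel
    }
    return reinterpret_cast<device_tensor_sketch *>(static_cast<uintptr_t>(h));
}

inline tensor_handle_sketch_t to_handle_sketch(device_tensor_sketch * t) {
    return static_cast<tensor_handle_sketch_t>(reinterpret_cast<uintptr_t>(t));  // nullptr maps back to the sentinel
}

Because both handle types are 64-bit integers in the IDL, the type corrections change no generated code; they only make a crossed tensor/graph handle visible at compile time on the device side.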