diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 1723fd3d4fb73..e43af19ff9390 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -220,7 +220,7 @@ else() target_compile_options(hexagon_npu_skel_OBJS PUBLIC -fsanitize=address -fno-omit-frame-pointer ) - target_link_libraries(hexagon_npu_skel_OBJS PUBLIC + target_link_options(hexagon_npu_skel_OBJS PUBLIC -fsanitize=address ) endif() @@ -248,9 +248,9 @@ else() add_library(hexagon_npu_skel SHARED $) target_link_libraries(hexagon_npu_skel - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a - ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1 + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1 + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") target_link_libraries(hexagon_npu_skel qprintf_static) diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index f07391711241d..cbe136cf1fb74 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -17,21 +17,30 @@ namespace { struct npu_device_context { + std::unique_ptr power_utils; // Power management utilities std::unique_ptr thread_pool; std::unique_ptr f16_to_f32_table; // TODO: store vtcm? bool init() { if (!init_ltu()) { - DEVICE_LOG_ERROR("Failed to initialize LTU"); + DEVICE_LOG_ERROR("Failed to initialize LTU\n"); return false; } if (!init_thread_pool()) { - DEVICE_LOG_ERROR("Failed to initialize thread pool"); + DEVICE_LOG_ERROR("Failed to initialize thread pool\n"); return false; } - DEVICE_LOG_DEBUG("NPU device context initialized"); + power_utils = std::make_unique(); + if (power_utils && power_utils->is_valid()) { + power_utils->set_dvcs_performance_mode(true); + DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n"); + } else { + DEVICE_LOG_ERROR("Failed to initialize power utilities\n"); + } + + DEVICE_LOG_DEBUG("NPU device context initialized\n"); return true; } @@ -41,29 +50,29 @@ struct npu_device_context { f16_to_f32_table = std::make_unique(kLtuCount); if (!f16_to_f32_table) { - DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table"); + DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n"); return false; } hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount); - DEVICE_LOG_DEBUG("f16_to_f32 table initialized"); + DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n"); return true; } bool init_thread_pool() { if (thread_pool) { - DEVICE_LOG_DEBUG("Thread pool already initialized"); + DEVICE_LOG_DEBUG("Thread pool already initialized\n"); return true; } auto pool = std::make_unique(); if (!pool) { - DEVICE_LOG_ERROR("Failed to create thread pool"); + DEVICE_LOG_ERROR("Failed to create thread pool\n"); return false; } thread_pool = std::move(pool); - DEVICE_LOG_DEBUG("Thread pool initialized"); + DEVICE_LOG_DEBUG("Thread pool initialized\n"); return true; } }; @@ -102,25 +111,25 @@ int npu_device_open(const char * uri, remote_handle64 * h) { // TODO: should we have a device context here? 
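+    // One npu_device_context is created per FastRPC session: init() builds the
+    // f16 -> f32 lookup table, spawns the worker thread pool and enables the DCVS
+    // performance mode via power_utils; the opaque pointer is returned to the host
+    // as the remote handle.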
auto * context = new npu_device_context(); if (!context->init()) { - DEVICE_LOG_ERROR("Failed to initialize npu_device_context"); + DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n"); delete context; return AEE_EFAILED; } *h = reinterpret_cast(context); - DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h); + DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h); return AEE_SUCCESS; } int npu_device_close(remote_handle64 h) { auto * context = device_context_from_handle(h); if (!context) { - DEVICE_LOG_ERROR("Invalid npu_device_context handle"); + DEVICE_LOG_ERROR("Invalid npu_device_context handle\n"); return AEE_EINVHANDLE; } delete context; - DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h); + DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h); return AEE_SUCCESS; } @@ -139,7 +148,7 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, NPU_UNUSED(_h); if (!srcs || srcsLen <= 0 || !dst || !is_supported) { - DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments"); + DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n"); return AEE_EINVARGS; } @@ -185,7 +194,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h, int tensor_handlesLen) { NPU_UNUSED(_h); if (!tensor_handles || tensor_handlesLen < 0) { - DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments"); + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n"); return AEE_EINVARGS; } @@ -194,7 +203,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h, if (tensor) { delete tensor; } else { - DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i); + DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i); } } @@ -250,13 +259,13 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { auto dev_ctx = device_context_from_handle(_h); if (!dev_ctx) { - DEVICE_LOG_DEBUG("Invalid npu_device_context handle"); + DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n"); return AEE_EINVHANDLE; } auto * graph = graph_from_handle(graph_handle); if (!graph) { - DEVICE_LOG_ERROR("Invalid graph handle"); + DEVICE_LOG_ERROR("Invalid graph handle\n"); return AEE_EINVHANDLE; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index c963ef966ea22..19326a523d3ff 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -91,6 +91,7 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread const bool should_sync = requires_thread_barrier(op); if (pool && should_sync && i < _tensor_count - 1) { + // For the last tensor, the thread pool will handle synchronization DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this, params.get_thread_index(), diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 5437fb848e3b0..b128a6b82ff82 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -13,7 +13,7 @@ inline float f16_to_f32(const npu_device_fp16_t src) { } // From: ggml/src/ggml-cpu/ops.cpp -template +template void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, @@ -24,6 +24,7 @@ void flash_attn_impl(hexagon::tensor * out, static_assert(3 <= hexagon::kMaxParamsCount, 
"flash_attn op params count exceeds max params count"); constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32; + constexpr const bool kHasMask = _HasMask; if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) { DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n", @@ -32,6 +33,11 @@ void flash_attn_impl(hexagon::tensor * out, return; } + if (kHasMask != (mask != nullptr)) { + DEVICE_LOG_ERROR("flash_attn_impl: mask is required when kHasMask is true\n"); + return; + } + float scale = out->get_op_param(0); const float max_bias = out->get_op_param(1); const float logit_softcap = out->get_op_param(2); @@ -96,7 +102,7 @@ void flash_attn_impl(hexagon::tensor * out, const uint8_t * q_ptr = q->get_read_buffer(); const uint8_t * k_ptr = k->get_read_buffer(); const uint8_t * v_ptr = v->get_read_buffer(); - const uint8_t * mask_ptr = mask ? mask->get_read_buffer() : nullptr; + const uint8_t * mask_ptr = kHasMask ? mask->get_read_buffer() : nullptr; const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr; float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator @@ -125,11 +131,17 @@ void flash_attn_impl(hexagon::tensor * out, } const npu_device_fp16_t * mp = - mask_ptr ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1) + + kHasMask ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1) + (iq2 % mask->get_ne(2)) * mask->get_nb(2) + (iq3 % mask->get_ne(3)) * mask->get_nb(3)) : nullptr; + q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); + + if (kHasMask) { + hexagon::l2fetch_row(reinterpret_cast(mp), mask->get_nb(1)); + } + // k indices const int ik3 = iq3 / rk3; const int ik2 = iq2 / rk2; @@ -138,8 +150,6 @@ void flash_attn_impl(hexagon::tensor * out, const int iv3 = iq3 / rv3; const int iv2 = iq2 / rv2; - q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); - // online softmax / attention // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf @@ -147,7 +157,7 @@ void flash_attn_impl(hexagon::tensor * out, const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3); for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); - float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; + float mv = kHasMask ? 
(slope * f16_to_f32(mp[ic])) : 0.0f; if (mv == -INFINITY) { continue; } @@ -282,9 +292,17 @@ bool flash_attn_f32(tensor * out, compute_params * params) { const auto * mask = out->get_src(3); const auto * sinks = out->get_src(4); if (k->get_type() == NPU_DATA_TYPE_F16) { - flash_attn_impl(out, q, k, v, mask, sinks, params); + if (mask) { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } else { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } } else { - flash_attn_impl(out, q, k, v, mask, sinks, params); + if (mask) { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } else { + flash_attn_impl(out, q, k, v, mask, sinks, params); + } } return true; } @@ -338,8 +356,8 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec, if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) { DEVICE_LOG_DEBUG( - "[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, " - "v ne: %ld, %ld, %ld, %ld\n", + "[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, " + "v ne: %lld, %lld, %lld, %lld\n", op_get_name(op), dst->ne[0], dst->ne[1], @@ -359,24 +377,25 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec, if (is_transposed_or_permuted(dst->nb)) { DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n", op_get_name(op), - dst->nb[0], - dst->nb[1], - dst->nb[2], - dst->nb[3]); + (size_t) dst->nb[0], + (size_t) dst->nb[1], + (size_t) dst->nb[2], + (size_t) dst->nb[3]); return false; } if (q->ne[0] != k->ne[0]) { - DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n", - op_get_name(op), - q->ne[0], - q->ne[1], - q->ne[2], - q->ne[3], - k->ne[0], - k->ne[1], - k->ne[2], - k->ne[3]); + DEVICE_LOG_DEBUG( + "[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n", + op_get_name(op), + q->ne[0], + q->ne[1], + q->ne[2], + q->ne[3], + k->ne[0], + k->ne[1], + k->ne[2], + k->ne[3]); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_glu.cpp b/ggml/src/ggml-qnn/npu/device/op_glu.cpp new file mode 100644 index 0000000000000..f3e89064fb99c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_glu.cpp @@ -0,0 +1,228 @@ + +#include "op_glu.hpp" + +#include "type_traits.hpp" +#include "util.hpp" + +namespace { + +template struct get_data_type {}; + +template +struct get_data_type { + using type = _TyData; + using param_type = typename std::remove_cv::type>::type; +}; + +inline float dummy_load_coeff() { + // This is a dummy function to satisfy the template requirements. + // In practice, this should be replaced with a proper coefficient loading function. + return 0; +} + +inline float expf_f16_guard_inf(float x) { + // Avoid overflow for large values, f16: log(65504) + constexpr float kMaxExp = 11.0898664f; + + if (x >= kMaxExp) { + // Avoid overflow for large values + return std::numeric_limits::infinity(); + } + + return std::expf(x); +} + +inline void glu_vec_op_f16_f16(const __fp16 * src0, const __fp16 * src1, __fp16 * dst, size_t count, float coeff) { + // TODO: use simd version, for some input hexagon intrinsics will generate nan instead of inf. 
+ for (uint32_t i = 0; i < count; ++i) { + float x = src0[i]; + float g = src1[i]; + + dst[i] = (x / (1.0f + expf_f16_guard_inf(-x))) * g; + } +} + +inline void glu_vec_op_f32_f32(const float * src0, + const float * src1, + float * dst, + size_t count, + hexagon::HVX_VectorPair_x4 coeff) { + using namespace hexagon::vec; + vec_trans_with_param_impl( + src0, src1, dst, count, coeff); +} + +template +bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + using param_type = typename get_data_type::param_type; + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + static_assert(std::is_same_v, + "GluRowFunc must have the same param type as CoeffLoadFunc"); + + if (!out) { + return false; + } + + const bool has_src1 = out->get_src(1) != nullptr; + auto * src0 = out->get_src(0); + auto * src1 = has_src1 ? out->get_src(1) : src0; + if (!src0 || !src1) { + return true; // skip if no src + } + + const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2; + if (out->get_ne(0) != total_cols) { + DEVICE_LOG_ERROR( + "[hexagon-npu][GLU]out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols); + return false; + } + + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = params->get_work_slice(total_rows); + if (start_end.first >= start_end.second) { + return true; + } + + uint8_t * dst_ptr = out->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("[hexagon-npu][GLU]glu_impl: dst_ptr is not writable, tensor: %p, type: %s\n", + (void *) out, + hexagon::get_type_name(out->get_type())); + return false; + } + + const int32_t swapped = out->get_op_param(1); + const uint8_t * src0_ptr = src0->get_read_buffer(); + const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type)); + if (swapped) { + std::swap(src0_ptr, src1_ptr); + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); + + auto coeff = _CoeffLoadFunc(); + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); + for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? 
+ const auto i13 = i03 % src1->get_ne(3); + const auto i12 = i02 % src1->get_ne(2); + const auto i11 = i01 % src1->get_ne(1); + + auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * src1_row = src1_plane + i11 * src1->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + _GluRowFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), + reinterpret_cast(dst_row), + static_cast(total_cols), + coeff); + } + + out->release_write_buffer(); // mark the output tensor as modified + return true; +} + +template +bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) { + using namespace hexagon::vec::math; + + if (out->get_op_param(0) != NPU_GLU_OP_SWIGLU) { + DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", (int) out->get_op_param(0)); + return false; + } + + if (out->get_type() != _DataType) { + DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n", + hexagon::get_type_name(out->get_type()), + hexagon::get_type_name(_DataType)); + return false; + } + + if constexpr (_DataType == NPU_DATA_TYPE_F32) { + return glu_impl(out, params); + } else if constexpr (_DataType == NPU_DATA_TYPE_F16) { + return glu_impl(out, params); + } + + DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type())); + return true; +} + +} // namespace + +namespace hexagon { + +bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params) { + return glu_compute(out, params); +} + +bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params) { + return glu_compute(out, params); +} + +bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, + const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, + size_t src_len) { + const auto op = op_spec->op; + if (op != NPU_OP_GLU) { + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) { + DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), (int) op_spec->params[0]); + return false; + } + + if (!dst || !srcs || src_len < 1) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); + return false; + } + + const auto & src0 = srcs[0]; + if (dst->type != src0.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", + hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), + hexagon::get_type_name(dst->type)); + return false; + } + + if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG( + "[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type)); + return false; + } + + if (src_len > 1) { + if (!hexagon::is_same_shape(src0, *dst) || !hexagon::is_same_shape(srcs[1], *dst)) { + DEVICE_LOG_DEBUG("[%s]src0, src1 and dst have different shape\n", hexagon::op_get_name(op)); + return false; // src0 and src1 have the same shape as dst + } + } else { + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "GLU requires max dims 4"); + if (src0.ne[0] / 2 != dst->ne[0] || src0.ne[1] != dst->ne[1] || src0.ne[2] != dst->ne[2] || + src0.ne[3] != dst->ne[3]) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape: src0.ne[0]: 
%ld, dst.ne[0]: %ld\n", + hexagon::op_get_name(op), + (long) src0.ne[0], + (long) dst->ne[0]); + return false; + } + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_glu.hpp b/ggml/src/ggml-qnn/npu/device/op_glu.hpp new file mode 100644 index 0000000000000..075dce9ad6ca9 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_glu.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "op_types.hpp" + +namespace hexagon { + +bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params); +bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params); + +bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, + const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, + size_t src_len); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 647fdd347ff3e..c423b24778981 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -3,11 +3,13 @@ #include "op_impl.hpp" #include "op_flash_attn.hpp" +#include "op_glu.hpp" #include "op_mul_mat.hpp" #include "op_rope.hpp" #include "type_traits.hpp" #include "vec_ops.hpp" +#include #include namespace { @@ -59,15 +61,10 @@ template struct get_data_type -struct get_data_type { - using type = _TyData; -}; - template struct get_data_type { using type = _TyData; - using param_type = typename std::remove_cv::type>::type; + using param_type = typename std::remove_cv::type>::type; }; template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { @@ -325,171 +322,6 @@ bool is_unary_op_supported(const npu_device_tensor_op_spec * op_spec, return true; } -inline void glu_vec_op_f32_f32(const float * src0, - const float * src1, - float * dst, - size_t count, - hexagon::HVX_VectorPair_x4 coeff) { - using namespace hexagon::vec; - vec_trans_with_param_impl( - src0, src1, dst, count, coeff); -} - -inline void glu_vec_op_f16_f16(const npu_device_fp16_t * src0, - const npu_device_fp16_t * src1, - npu_device_fp16_t * dst, - size_t count, - hexagon::HVX_VectorPair_x4 coeff) { - using namespace hexagon::vec; - vec_trans_with_param_impl( - src0, src1, dst, count, coeff); -} - -template -bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) { - using data_type = typename get_data_type::type; - static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); - - if (!out) { - return false; - } - - const bool has_src1 = out->get_src(1) != nullptr; - auto * src0 = out->get_src(0); - auto * src1 = has_src1 ? out->get_src(1) : src0; - if (!src0 || !src1) { - return true; // skip if no src - } - - const auto total_cols = has_src1 ? 
src0->get_ne(0) : src0->get_ne(0) / 2; - if (out->get_ne(0) != total_cols) { - DEVICE_LOG_ERROR("out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols); - return false; - } - - auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); - const auto start_end = params->get_work_slice(total_rows); - if (start_end.first >= start_end.second) { - return true; - } - - uint8_t * dst_ptr = out->get_write_buffer(); - if (!dst_ptr) { - DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n", - (void *) out, - hexagon::get_type_name(out->get_type())); - return false; - } - - const int32_t swapped = out->get_op_param(1); - const uint8_t * src0_ptr = src0->get_read_buffer(); - const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type)); - if (swapped) { - std::swap(src0_ptr, src1_ptr); - } - - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); - - auto coeff = _CoeffLoadFunc(); - const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); - for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { - const auto i03 = ir / rows_per_cube; - const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); - const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? - const auto i13 = i03 % src1->get_ne(3); - const auto i12 = i02 % src1->get_ne(2); - const auto i11 = i01 % src1->get_ne(1); - - auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); - auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); - auto * src1_row = src1_plane + i11 * src1->get_nb(1); - auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); - if (ir + 1 < start_end.second) { - hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); - hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); - } - - _GluRowFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), - reinterpret_cast(dst_row), - static_cast(total_cols), - coeff); - } - - out->release_write_buffer(); // mark the output tensor as modified - return true; -} - -template -bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) { - using namespace hexagon::vec::math; - - if (out->get_op_param(0) != NPU_GLU_OP_SWIGLU) { - DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", out->get_op_param(0)); - return false; - } - - if (out->get_type() != _DataType) { - DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n", - hexagon::get_type_name(out->get_type()), - hexagon::get_type_name(_DataType)); - return false; - } - - if constexpr (_DataType == NPU_DATA_TYPE_F32) { - return glu_impl(out, params); - } else if constexpr (_DataType == NPU_DATA_TYPE_F16) { - return glu_impl(out, params); - } - - DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type())); - return true; -} - -bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec, - const npu_device_tensor_spec * dst, - const npu_device_tensor_spec * srcs, - size_t src_len) { - const auto op = op_spec->op; - if (op != NPU_OP_GLU) { - DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); - return false; - } - - if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) { - DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), op_spec->params[0]); - return false; - } - - if (!dst || !srcs || src_len < 1) { 
- DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); - return false; - } - - const auto & src0 = srcs[0]; - if (dst->type != src0.type) { - DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", - hexagon::op_get_name(op), - hexagon::get_type_name(src0.type), - hexagon::get_type_name(dst->type)); - return false; - } - - if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) { - DEVICE_LOG_DEBUG( - "[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type)); - return false; - } - - if (!hexagon::is_same_shape(src0, *dst)) { - DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); - return false; - } - - return false; // TODO: fix: for some input hexagon intrinsics will generate nan instead of inf. -} - struct op_capabilities { npu_device_tensor_op op; hexagon::op_is_supported_func_type is_supported; @@ -499,60 +331,60 @@ struct op_capabilities { constexpr const op_capabilities kOpCapabilities[] = { { - NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, + NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, { hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, true, // requires_thread_barrier + }, true, // requires_thread_barrier }, { - NPU_OP_ADD, is_element_wise_op_supported, + NPU_OP_ADD, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { NPU_OP_SUB, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_MUL, is_element_wise_op_supported, + NPU_OP_MUL, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_RMS_NORM, is_unary_op_supported, + NPU_OP_RMS_NORM, is_unary_op_supported, { unary_op, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_FLASH_ATTN,hexagon::is_flash_attn_supported, + NPU_OP_FLASH_ATTN, hexagon::is_flash_attn_supported, { hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 }, true, // requires_thread_barrier }, { - NPU_OP_ROPE, hexagon::is_rope_supported, + NPU_OP_ROPE, hexagon::is_rope_supported, { hexagon::rope_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier - }, + }, false, + }, { - NPU_OP_GLU, is_glu_op_supported, + NPU_OP_GLU, hexagon::is_glu_op_supported, { - glu_compute, // NPU_DATA_TYPE_F32 - glu_compute, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier + hexagon::glu_f32, // NPU_DATA_TYPE_F32 + hexagon::glu_f16, // NPU_DATA_TYPE_F16 + }, true, // TODO: should we avoid using thread barrier? 
}, }; diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 41bf2c7838d6b..852b347bce212 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -36,8 +36,9 @@ void mul_mat_impl(hexagon::tensor * src0, using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); - auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; @@ -62,8 +63,8 @@ void mul_mat_impl(hexagon::tensor * src0, if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || start_end_element.second <= start_end_element.first) { DEVICE_LOG_DEBUG( - "mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), " - "start_end_element: (%ld, %ld)\n", + "mul_mat_impl: no work to do, start_end_plane: (%lld, %lld), start_end_row: (%lld, %lld), " + "start_end_element: (%lld, %lld)\n", start_end_plane.first, start_end_plane.second, start_end_row.first, @@ -116,6 +117,7 @@ void mul_mat_impl(hexagon::tensor * src0, return; } + auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector(); constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0; const uint8_t * src0_ptr = src0->get_read_buffer(); const uint8_t * src1_ptr = src1->get_read_buffer(); @@ -146,7 +148,8 @@ void mul_mat_impl(hexagon::tensor * src0, auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size; dequantize_row_func(src0_row, reinterpret_cast(cached_row_ptr), - src0->get_ne(0)); + src0->get_ne(0), + dequant_table); } last_cached_plane_ptr = src0_plane; @@ -218,8 +221,9 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, using data_type0 = typename get_data_type::data_type0; using data_type1 = typename get_data_type::data_type1; - const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); - auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; + auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table; if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; @@ -229,7 +233,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, if (dst->get_ne(0) >= params->get_thread_count()) { start_end_element = params->get_work_slice(dst->get_ne(0)); } else { - DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %ldx%ldx%ldx%ld\n", + DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n", hexagon::get_type_name(src1->get_type()), src1->get_ne(0), src1->get_ne(1), @@ -241,7 +245,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, if (start_end_element.second <= start_end_element.first) { DEVICE_LOG_DEBUG( 
"mul_mat_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), " - "start_end_element: [%ld, %ld)\n", + "start_end_element: [%lld, %lld)\n", start_end_element.first, start_end_element.second); return; @@ -297,6 +301,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, return; } + auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector(); constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0; const uint8_t * src0_ptr = src0->get_read_buffer(); const uint8_t * src1_ptr = src1->get_read_buffer(); @@ -325,8 +330,10 @@ void mul_mat_gemv_impl(hexagon::tensor * src0, } auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size; - dequantize_row_func( - src0_row, reinterpret_cast(cached_row_ptr), src0->get_ne(0)); + dequantize_row_func(src0_row, + reinterpret_cast(cached_row_ptr), + src0->get_ne(0), + dequant_table); } src0_plane = src0_plane_cache_ptr; diff --git a/ggml/src/ggml-qnn/npu/device/op_rope.cpp b/ggml/src/ggml-qnn/npu/device/op_rope.cpp index 27a35394c50c4..d73d13983ac01 100644 --- a/ggml/src/ggml-qnn/npu/device/op_rope.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_rope.cpp @@ -165,7 +165,7 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { } if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) { - DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %d\n", n_dims, out->get_ne(0) / 2); + DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %lld\n", n_dims, out->get_ne(0) / 2); return false; // invalid n_dims for vision ROPE } diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index a6feefe2ecaa6..5d2dc44d5475b 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -20,30 +20,30 @@ class tensor { void * mmap_address = nullptr; auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address); if (ret != AEE_SUCCESS) { - DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d\n", (int) ret); return; } _data = static_cast(mmap_address); - DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n", - (void *) this, - (long) _info.ne[0], - (long) _info.ne[1], - (long) _info.ne[2], - (long) _info.ne[3], - _info.buffer_fd, - _info.offset, - (void *) mmap_address, - phy_address); + DEVICE_LOG_DEBUG("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n", + (void *) this, + (long) _info.ne[0], + (long) _info.ne[1], + (long) _info.ne[2], + (long) _info.ne[3], + (int) _info.buffer_fd, + (size_t) _info.offset, + (void *) mmap_address, + (long) phy_address); } ~tensor() noexcept { auto ret = HAP_mmap_put(_info.buffer_fd); if (ret != AEE_SUCCESS) { - DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d\n", (int) ret); } - DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); + DEVICE_LOG_DEBUG("~tensor(%p) fd: %d\n", (void *) this, _info.buffer_fd); } void flush() const { @@ -131,7 +131,7 @@ class tensor { uint8_t * get_write_buffer() const { if (_info.is_constant) { - DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this); + DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p\n", (void *) this); return nullptr; // Do not allow writing to constant tensors } diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp 
b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 902bdcfc564c7..aeaee16bf9d3b 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -14,7 +14,7 @@ namespace hexagon { constexpr const size_t kMaxThreadCount = 4; -constexpr const size_t kDefaultStackSize = 1024 * 64; // 64KB +constexpr const size_t kDefaultStackSize = NPU_THREAD_STACK_SIZE; // 64KB template class qurt_thread { public: @@ -24,7 +24,7 @@ template class qurt_thread { qurt_thread_func_type thread_func, void * arg, unsigned short priority) { - DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str()); + DEVICE_LOG_DEBUG("qurt_thread.create: %s\n", thread_name.c_str()); qurt_thread_attr_init(&_attributes); qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str()); qurt_thread_attr_set_stack_addr(&_attributes, _stack); @@ -37,26 +37,26 @@ template class qurt_thread { auto ret = qurt_thread_create( &_tid, &_attributes, reinterpret_cast(&qurt_thread::thread_func_impl), (void *) this); if (ret != QURT_EOK) { - DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to create thread: %d\n", (int) ret); _func = nullptr; _arg = nullptr; return; } - DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid); + DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d\n", thread_name.c_str(), (int) _tid); } ~qurt_thread() { - DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid); + DEVICE_LOG_DEBUG("qurt_thread.destroy: %d\n", (int) _tid); int thread_exit_code = QURT_EOK; auto ret = qurt_thread_join(_tid, &thread_exit_code); if (ret != QURT_EOK && ret != QURT_ENOTHREAD) { - DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret); + DEVICE_LOG_ERROR("Failed to join thread: %d\n", (int) ret); return; } if (thread_exit_code != QURT_EOK) { - DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code); + DEVICE_LOG_ERROR("Thread exit code: %d\n", (int) thread_exit_code); } } @@ -135,7 +135,7 @@ template class thread_pool { auto thread = std::make_unique( thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority); if (!thread->is_valid()) { - DEVICE_LOG_ERROR("Failed to create thread: %zu", i); + DEVICE_LOG_ERROR("Failed to create thread: %zu\n", i); // destroy all barriers and threads at destructor return; } @@ -143,11 +143,11 @@ template class thread_pool { _threads[i] = std::move(thread); } - DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount); + DEVICE_LOG_DEBUG("thread_pool.created: %zu\n", kMaxSubThreadCount); } ~thread_pool() { - DEVICE_LOG_DEBUG("thread_pool.destroy"); + DEVICE_LOG_DEBUG("thread_pool.destroy\n"); _thread_exit = true; qurt_barrier_wait(&_pending); // release all task threads @@ -161,7 +161,7 @@ template class thread_pool { bool sync_execute(task_type task, void * arg) { if (!task) { - DEVICE_LOG_ERROR("Invalid task"); + DEVICE_LOG_ERROR("Invalid task\n"); return false; } @@ -174,7 +174,7 @@ template class thread_pool { qurt_barrier_wait(&_pending); task(this, &_thread_params[0], arg); - DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); + DEVICE_LOG_DEBUG("main_thread.task_completed: 0\n"); qurt_barrier_wait(&_completed); @@ -198,19 +198,19 @@ template class thread_pool { auto * param = reinterpret_cast(arg); - DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.start: %zu\n", param->tidx); auto & pool = *(param->pool); for (;;) { qurt_barrier_wait(&pool._pending); if 
(pool._thread_exit) { - DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu\n", param->tidx); break; } #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING auto task_begin_cycles = pool._task_begin_cycles.load(); - DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus", + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus\n", param->tidx, static_cast( HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); @@ -221,18 +221,18 @@ template class thread_pool { task(param->pool, param, pool._arg); } - DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu\n", param->tidx); qurt_barrier_wait(&pool._completed); #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING - DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus", + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus\n", param->tidx, static_cast( HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); #endif } - DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx); + DEVICE_LOG_DEBUG("thread_func_impl.end: %zu\n", param->tidx); } std::atomic_bool _thread_exit = false; diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 3350167749230..0589aa414cf2b 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -3,8 +3,6 @@ #include "op_types.hpp" // TODO: remove this include #include "vec_ops.hpp" -#include - #include static_assert(sizeof(npu_device_block_q4_k) == @@ -31,42 +29,122 @@ inline npu_device_fp16_t to_fp16(const float src) { template inline HVX_Vector load_into_vector(const _TStruct * src) { static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load"); - const HVX_Vector * qs0 = reinterpret_cast(&(src->*_MemberPtr)); - HVX_Vector prev = *qs0; - HVX_Vector curr = hexagon::is_addr_aligned(qs0) ? 
Q6_V_vzero() : *(qs0 + 1); - return Q6_V_valign_VVR(curr, prev, (size_t) qs0); + return *reinterpret_cast(&(src->*_MemberPtr)); +} + +template inline HVX_Vector load_struct_into_vector(const _TStruct * src) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load"); + + return *reinterpret_cast(src); } template inline HVX_Vector load_block_generic(const _TBlock & src) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong block size/padding"); return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src); } -template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding"); - constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); +template inline HVX_Vector make_scale_load_mask() { + static_assert(sizeof(_TBlock) < 32, "wrong block size/padding"); + static_assert(sizeof(_TBlock::qs) == 16 || sizeof(_TBlock::qs) == 32, "wrong quantization block size"); + + constexpr const size_t kScaleBlockSize = QUANT_BLOCK_SIZE * sizeof(hexagon::dequant_output_type); + + // TODO: handle the case that scale not at the start of struct + hexagon::HVX_VectorAlias ret; + for (size_t i = 0; i < QUANT_BLOCK_SIZE; ++i) { + size_t base = i * 2; + ret.u8[base] = 0; + ret.u8[base + 1] = 1; + + ret.u8[base + kScaleBlockSize] = sizeof(_TBlock); + ret.u8[base + kScaleBlockSize + 1] = sizeof(_TBlock) + 1; + } + + return ret.v; +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs, HVX_VectorPred mask) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs)); + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale); + return Q6_V_vmux_QVV(mask, blocks, block1); } -template inline HVX_Vector load_qual_block_generic(const _TBlock * srcs) { - static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding"); +template +inline hexagon::HVX_Vector_x2 load_dual_block_generic(const _TBlock * srcs, + HVX_VectorPred mask, + const HVX_Vector scale_indices) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; + + hexagon::HVX_Vector_x2 result; + + HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs); + + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale); + HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks); + + result.val[0] = Q6_V_vmux_QVV(mask, blocks, block1); + result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0); + + return result; +} + +template inline hexagon::HVX_VectorPred_x3 make_quad_block_mask() { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding"); constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - HVX_Vector blocks = load_into_vector<_TBlock, 4, &_TBlock::qs>(srcs); - HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); - HVX_VectorPair qp0 = 
Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); + hexagon::HVX_VectorPred_x3 mask; + mask.val[0] = Q6_Q_vsetq_R(kSizeOfQs); + mask.val[1] = Q6_Q_vsetq_R(kSizeOfQs * 3); + mask.val[2] = Q6_Q_vsetq_R(kSizeOfQs * 2); + return mask; +} + +template +inline hexagon::HVX_Vector_x3 load_qual_block_generic(const _TBlock * srcs, + const hexagon::HVX_VectorPred_x3 mask, + const HVX_Vector scale_indices) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); + constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs; + + hexagon::HVX_Vector_x3 result; + + const HVX_Vector blocks = load_struct_into_vector<_TBlock, 4>(srcs); + + { + HVX_Vector block0 = Q6_V_vror_VR(blocks, kSizeOfScale); + HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale * 2); + + HVX_Vector block2 = Q6_V_vror_VR(blocks, kSizeOfScale * 3); + HVX_Vector block3 = Q6_V_vror_VR(blocks, kSizeOfScale * 4); + + HVX_Vector block01 = Q6_V_vmux_QVV(mask.val[0], block0, block1); + HVX_Vector block23 = Q6_V_vmux_QVV(mask.val[1], block2, block3); - HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); - HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); - HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs); + result.val[0] = Q6_V_vmux_QVV(mask.val[2], block01, block23); + } + + { + HVX_Vector scale23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2); + + HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks); + scale23 = Q6_Vb_vshuff_Vb(scale23); + + result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0); + result.val[2] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale23, 0); + } - return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2)); + return result; } inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { + // TODO: use intrinsics if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -324,23 +402,24 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count) { } } -void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access + const HVX_VectorPred mask = Q6_Q_vsetq_R(sizeof(npu_device_block_q8_0::qs)); + const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2); int i = 0; for (; i + 1 < nb; i += 2) { const auto & src0 = src_ptr[i]; const auto & src1 = src_ptr[i + 1]; - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + HVX_Vector scales01 = Q6_V_vmux_QVV(scale_mask, Q6_Vh_vsplat_R(src0.d), Q6_Vh_vsplat_R(src1.d)); - HVX_Vector qs = load_dual_block_generic(src_ptr + i); + HVX_Vector qs = load_dual_block_generic(src_ptr + i, mask); HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs))); HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); @@ -363,44 +442,39 @@ void dequantize_row_q8_0(const void * src, 
hexagon::dequant_output_type * dst, s } template -void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(qk % 2 == 0, "qk must be even"); static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); + static const auto load_masks = make_quad_block_mask(); + static const HVX_Vector scale_indices __attribute__((aligned(hexagon::kBytesPerVector))) = + make_scale_load_mask(); + const int nb = count / qk; const auto * src_ptr = reinterpret_cast(src); - const HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector minus = Q6_Vb_vsplat_R(8); hexagon::dequant_output_type * dst_ptr = dst; // TODO: opt for aligned access int i = 0; for (; i + 3 < nb; i += 4) { - const auto & src0 = src_ptr[i]; - const auto & src1 = src_ptr[i + 1]; - const auto & src2 = src_ptr[i + 2]; - const auto & src3 = src_ptr[i + 3]; + auto qs = load_qual_block_generic(src_ptr + i, load_masks, scale_indices); - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); - HVX_Vector scales23 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src3.d), Q6_Vh_vsplat_R(src2.d), hexagon::kBytesPerVector / 2); + HVX_Vector q_lo = qs.val[0]; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4); - HVX_Vector qs = load_qual_block_generic(src_ptr + i); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + q_lo = Q6_V_lo_W(qp0); + q_hi = Q6_V_hi_W(qp0); - q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, qs.val[1]); + q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, qs.val[2]); + + q_lo = Q6_Vhf_equals_Vqf16(q_lo); q_hi = Q6_Vhf_equals_Vqf16(q_hi); if constexpr (_IsDstAligned) { @@ -415,21 +489,16 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d } for (; i + 1 < nb; i += 2) { - const auto & src0 = src_ptr[i]; - const auto & src1 = src_ptr[i + 1]; + auto qs = load_dual_block_generic(src_ptr + i, load_masks.val[0], scale_indices); + HVX_Vector q_lo = qs.val[0]; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2)); - HVX_Vector scales01 = - Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); - HVX_Vector qs = load_dual_block_generic(src_ptr + i); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); - HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2)); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + q_lo = 
Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), qs.val[1]); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); if constexpr (_IsDstAligned) { *reinterpret_cast(dst_ptr) = q_lo; @@ -445,14 +514,15 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d); HVX_Vector qs = load_block_generic(curr_blk); - HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); + HVX_Vector q_lo = qs; HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs); - q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); - qp0 = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); - q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales); - q_lo = Q6_Vhf_equals_Vqf16(q_lo); + + q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0)); + qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0); + + q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scales); + q_lo = Q6_Vhf_equals_Vqf16(q_lo); if constexpr (_IsDstAligned) { hexagon::q6op_vstu_variable_aligned(dst_ptr, q_lo); @@ -462,24 +532,82 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d } } -void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count) { +HVX_Vector load_dequant_table_q4_0() { + constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values + constexpr const int kQ4ZeroPoint = 8; // zero point for q4_0 quantization + static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large"); + + static const HVX_Vector result = []() -> HVX_Vector { + union { + HVX_Vector v; + __fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)]; + } table __attribute__((aligned(hexagon::kBytesPerVector))); + + table.v = Q6_V_vzero(); + for (int i = 0; i < kTableSize; ++i) { + table.f16[i * 2] = i - kQ4ZeroPoint; // TODO: vectorize this? + } + return table.v; + }(); + + return result; +} + +void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { const bool dst_aligned = hexagon::is_addr_aligned(dst); if (dst_aligned) { - dequantize_row_q4_0_impl(src, dst, count); + dequantize_row_q4_0_impl(src, dst, count, table); } else { - dequantize_row_q4_0_impl(src, dst, count); + dequantize_row_q4_0_impl(src, dst, count, table); } } -void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count) { +HVX_Vector load_dequant_table_q4_k() { + constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values + static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large"); + + const static HVX_Vector result = []() -> HVX_Vector { + union { + HVX_Vector v; + __fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)]; + } table __attribute__((aligned(hexagon::kBytesPerVector))); + + table.v = Q6_V_vzero(); + for (int i = 0; i < kTableSize; ++i) { + table.f16[i * 2] = i; // TODO: vectorize this? 
+ } + return table.v; + }(); + + return result; +} + +void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) { + constexpr const int kQuantSubBlockSize = 32; + const int nb = count / QUANT_K_BLOCK_SIZE; const auto * src_ptr = reinterpret_cast(src); - auto * dst_ptr = reinterpret_cast<__fp16 *>(dst); + auto * dst_ptr = reinterpret_cast(dst); + + const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2); + + union { + HVX_VectorPair p[2]; + HVX_Vector v[4]; + } dual_pair __attribute__((aligned(hexagon::kBytesPerVector * 4))); - // TODO: use intrinsics for (int i = 0; i < nb; i++) { const uint8_t * q = src_ptr[i].qs; + HVX_Vector qv = *reinterpret_cast(q); + + HVX_Vector q_lo = qv; + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qv, 4); + HVX_VectorPair qp = Q6_W_vshuff_VVR(q_hi, q_lo, kQuantSubBlockSize * 3); + + dual_pair.p[0] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp)), table, 0); + dual_pair.p[1] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_hi_W(qp)), table, 0); + const __fp16 d = reinterpret_cast(src_ptr[i].d); const __fp16 min = reinterpret_cast(src_ptr[i].dmin); @@ -487,30 +615,61 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, s uint8_t sc = 0; uint8_t m = 0; const auto * scales = src_ptr[i].scales; - for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 128) { get_scale_min_k4(is + 0, scales, &sc, &m); + const __fp16 d0 = d * sc; + const __fp16 m0 = min * m; + + HVX_Vector dv0 = Q6_Vh_vsplat_R(reinterpret_cast(d0)); + HVX_Vector dm0 = Q6_Vh_vsplat_R(reinterpret_cast(m0)); + + get_scale_min_k4(is + 1, scales, &sc, &m); const __fp16 d1 = d * sc; const __fp16 m1 = min * m; - get_scale_min_k4(is + 1, scales, &sc, &m); + + HVX_Vector dv1 = Q6_Vh_vsplat_R(reinterpret_cast(d1)); + HVX_Vector dm1 = Q6_Vh_vsplat_R(reinterpret_cast(m1)); + + get_scale_min_k4(is + 2, scales, &sc, &m); const __fp16 d2 = d * sc; const __fp16 m2 = min * m; - for (int l = 0; l < 32; ++l) { - dst_ptr[0] = d1 * (q[l] & 0xF) - m1; - dst_ptr[32] = d2 * ((q[l] >> 4) & 0xF) - m2; - dst_ptr++; - } - dst_ptr += 32; - q += 32; - is += 2; + + HVX_Vector dv2 = Q6_Vh_vsplat_R(reinterpret_cast(d2)); + HVX_Vector dm2 = Q6_Vh_vsplat_R(reinterpret_cast(m2)); + + get_scale_min_k4(is + 3, scales, &sc, &m); + const __fp16 d3 = d * sc; + const __fp16 m3 = min * m; + + HVX_Vector dv3 = Q6_Vh_vsplat_R(reinterpret_cast(d3)); + HVX_Vector dm3 = Q6_Vh_vsplat_R(reinterpret_cast(m3)); + + HVX_Vector dv01 = Q6_V_vmux_QVV(scale_mask, dv0, dv1); + HVX_Vector dm01 = Q6_V_vmux_QVV(scale_mask, dm0, dm1); + + HVX_Vector dv23 = Q6_V_vmux_QVV(scale_mask, dv2, dv3); + HVX_Vector dm23 = Q6_V_vmux_QVV(scale_mask, dm2, dm3); + + q_lo = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64], dv01); + q_lo = Q6_Vqf16_vsub_Vqf16Vhf(q_lo, dm01); + + q_hi = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64 + 1], dv23); + q_hi = Q6_Vqf16_vsub_Vqf16Vhf(q_hi, dm23); + + reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); + reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + + dst_ptr += 128; + is += 4; } } } -void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) { hexagon::vec_cpy_f16(reinterpret_cast(src), dst, count); } -void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count) { +void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, 
size_t count, HVX_Vector) { hexagon::vec_cpy_f32(reinterpret_cast(src), reinterpret_cast(dst), count); } @@ -539,12 +698,16 @@ constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0, - quantize_row_q4_0 }, + quantize_row_q4_0, nullptr, + nullptr, nullptr, + load_dequant_table_q4_0 }, { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, sizeof(npu_device_block_q4_k), true, dequantize_row_q4_K, - quantize_row_q4_K }, + quantize_row_q4_K, nullptr, + nullptr, nullptr, + load_dequant_table_q4_k }, }; static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp index 363827de0af3d..cfa844aba5e09 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -3,6 +3,8 @@ #include "tensor.hpp" #include "util.hpp" +#include + namespace hexagon { using dequant_output_type = npu_device_fp16_t; @@ -10,9 +12,10 @@ using dequant_output_type = npu_device_fp16_t; bool init_f16_f32_table(float * table, size_t count); typedef void (*quantize_row_type)(const float * src, void * dst, size_t count); -typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count); +typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count, HVX_Vector table); typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count); typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count); +typedef HVX_Vector (*load_dequant_table_type)(); struct device_type_traits { npu_device_tensor_data_type type; @@ -21,11 +24,12 @@ struct device_type_traits { size_t type_size; bool is_quantized; - dequantize_row_type to_float; - quantize_row_type from_float; - vec_dot_type vec_dot; - vec_dot_type vec_dot_aligned; - can_use_aligned_vec_dot_type can_use_aligned_vec_dot; + dequantize_row_type to_float = nullptr; + quantize_row_type from_float = nullptr; + vec_dot_type vec_dot = nullptr; + vec_dot_type vec_dot_aligned = nullptr; + can_use_aligned_vec_dot_type can_use_aligned_vec_dot = nullptr; + load_dequant_table_type load_dequant_table = nullptr; }; const device_type_traits & get_type_traits(npu_device_tensor_data_type type); @@ -49,7 +53,7 @@ namespace hexagon { inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { auto * src0 = op->get_src(0); auto * src1 = op->get_src(1); - char buffer[1024]; + char buffer[512]; if (src1 == nullptr) { snprintf(buffer, sizeof(buffer), @@ -96,8 +100,10 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \ auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx) -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ - hexagon::npu_sub_process_scoped_timer \ +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ + hexagon::npu_sub_process_scoped_timer< \ + std::remove_reference_t::kBufferCount, \ + idx> \ __npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix) #else diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index d70d7401805d4..71b735ff5ac70 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -8,17 +8,16 @@ 
#include #include +#include #include #include -#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__) -#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__) -#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__) +#define DEVICE_LOG_ERROR(...) hexagon::log_error(__VA_ARGS__) +#define DEVICE_LOG_WARN(...) hexagon::log_message(__VA_ARGS__) +#define DEVICE_LOG_INFO(...) hexagon::log_message(__VA_ARGS__) #ifdef _DEBUG -# undef FARF_LOW -# define FARF_LOW 1 -# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__) +# define DEVICE_LOG_DEBUG(...) hexagon::log_message(__VA_ARGS__) #else # define DEVICE_LOG_DEBUG(...) (void) 0 #endif @@ -40,6 +39,20 @@ namespace hexagon { +__attribute__((format(printf, 1, 2))) inline void log_error(const char * format, ...) { + va_list args; + va_start(args, format); + std::vfprintf(stderr, format, args); + va_end(args); +} + +__attribute__((format(printf, 1, 2))) inline void log_message(const char * format, ...) { + va_list args; + va_start(args, format); + std::vprintf(format, args); + va_end(args); +} + inline constexpr const char * op_get_name(npu_device_tensor_op op) { switch (op) { case NPU_OP_MUL_MAT: @@ -137,23 +150,22 @@ class power_utils { return; } - HAP_power_request_t request = {}; - request.type = HAP_power_set_DCVS_v3; - request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + HAP_power_request_t request = {}; + request.type = HAP_power_set_DCVS_v3; + request.dcvs_v3.set_dcvs_enable = enable ? TRUE : FALSE; + request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + request.dcvs_v3.set_core_params = TRUE; if (enable) { - request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; - /* - * sleep_latency : To request for sleep latency in micro-seconds. - * Sleep latency is the minimum time before which the DSP sleeps - * Set latency to 65535 to reset it to the default value - */ - request.dcvs_v3.set_latency = TRUE; - request.dcvs_v3.latency = 1000; - - request.dcvs_v3.set_bus_params = TRUE; - request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; - request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO; - request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM; + request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v3.set_bus_params = TRUE; + request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.set_sleep_disable = TRUE; + request.dcvs_v3.sleep_disable = TRUE; } auto ret = HAP_power_set(_context_ptr, &request); @@ -359,7 +371,7 @@ template class npu_sub_process_scoped_ti inline auto make_scoped_perf_timer(const char * format, ...) 
{ va_list args; va_start(args, format); - char buffer[1024]; + char buffer[512]; vsnprintf(buffer, sizeof(buffer), format, args); va_end(args); return npu_scoped_timer<1024>(buffer); diff --git a/ggml/src/ggml-qnn/npu/device/vec_math.inl b/ggml/src/ggml-qnn/npu/device/vec_math.inl index ab7f01cf1b3d3..77782734cdce1 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_math.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_math.inl @@ -1120,10 +1120,75 @@ inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) { inline HVX_Vector_x2 hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one); - return { - Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)), - Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)), - }; + + HVX_Vector_x2 ret; + ret.val[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)); + ret.val[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)); + return ret; +} + +/** + * @brief Calculates exponential (e^x) for vector elements with infinity guard + * + * This function computes the exponential value for each element in the input vector. + * For input values greater than kMaxExp (88.02f), the function returns the provided + * infinity value instead of attempting to calculate an exponential that would overflow. + * + * @param sline The input vector containing values to compute exponential for + * @param inf The vector containing the infinity representation to use for guarded values + * @return HVX_Vector containing exponential values, with values > kMaxExp replaced by inf + * + * @note Input values greater than 88.02f will return the specified infinity value + */ +inline HVX_Vector qhmath_hvx_exp_vf_guard_inf(HVX_Vector sline, const HVX_Vector inf) { + constexpr float kMaxExp = 88.02f; + const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast(kMaxExp)); + + HVX_VectorPred pred_gt_max_exp = Q6_Q_vcmp_gt_VsfVsf(sline, max_exp); + + HVX_Vector out = qhmath_hvx_exp_vf(sline); + + out = Q6_V_vmux_QVV(pred_gt_max_exp, inf, out); + return out; +} + +/** + * @brief Vectorized division with guard for infinite denominators on HVX. + * + * Performs element-wise division num/denom using qhmath_hvx_div_vf and then + * masks out lanes where denom equals the provided inf value, forcing those + * lanes of the result to zero. This is a temporary guard until proper INF + * handling is implemented in the underlying division routine. + * + * @param num Numerator vector (per-lane). + * @param denom Denominator vector (per-lane); lanes equal to inf are zeroed in the output. + * @param coeffs Coefficients used by qhmath_hvx_div_vf for the reciprocal/division approximation. + * @param inf Lane value representing +INF to compare against denom. + * @return Vector of num/denom with lanes set to zero where denom == inf. + * + * @note NaNs, negative infinity, zero denominators, and subnormals are not explicitly handled. 
+ * @see qhmath_hvx_div_vf + */ +inline HVX_Vector qhmath_hvx_div_vf_guard_inf(HVX_Vector num, + HVX_Vector denom, + HVX_VectorPair_x4 coeffs, + const HVX_Vector inf) { + HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(denom, inf); + + // TODO: fix the inf in div + HVX_Vector out = qhmath_hvx_div_vf(num, denom, coeffs); + + out = Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out); + return out; +} + +inline HVX_Vector Q6_Vsf_vadd_VsfVsf_guard_inf(HVX_Vector num0, HVX_Vector num1, const HVX_Vector inf) { + HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(num0, inf); + + HVX_Vector out = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(num0, num1)); + + out = Q6_V_vmux_QVV(pred0, inf, out); + return out; } } // namespace hexagon::vec::math diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 92cb8ed9993b8..e286aebbb569b 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -8,12 +8,18 @@ namespace hexagon { +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; + template struct HEXAGON_pack { T val[N]; }; -using HVX_Vector_x2 = std::pair; +using HVX_Vector_x2 = HEXAGON_pack; +using HVX_Vector_x3 = HEXAGON_pack; +using HVX_Vector_x4 = HEXAGON_pack; using HVX_VectorPair_x4 = HEXAGON_pack; +using HVX_VectorPred_x3 = HEXAGON_pack; typedef union { HVX_VectorPair VV; @@ -24,8 +30,14 @@ typedef union { } V; } HVX_DV; -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kAlignMask = kBytesPerVector - 1; +typedef union { + HVX_Vector v; + float f32[kBytesPerVector / sizeof(float)]; + uint32_t u32[kBytesPerVector / sizeof(uint32_t)]; + __fp16 f16[kBytesPerVector / sizeof(__fp16)]; + uint16_t u16[kBytesPerVector / sizeof(uint16_t)]; + uint8_t u8[kBytesPerVector]; +} HVX_VectorAlias; inline size_t get_aligned_size(size_t size) { return (size + kAlignMask) & ~kAlignMask; @@ -383,22 +395,35 @@ _TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count) inline HVX_Vector vec_silu_f32_f32(HVX_Vector x, HVX_VectorPair_x4 coeff) { using namespace hexagon::vec::math; - HVX_Vector one = Q6_V_vsplat_R(0x3F800000); + constexpr float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast(kMaxExp)); + HVX_Vector one = Q6_V_vsplat_R(0x3F800000); // x/(1.0f + expf(-x)); - HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x)); - HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one)); - return qhmath_hvx_div_vf(x, denom, coeff); + HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x)); + HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(exp_neg_x, max_exp); + HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one)); + HVX_Vector out = qhmath_hvx_div_vf(x, denom, coeff); + out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + return out; } inline HVX_Vector vec_silu_f16_f16(HVX_Vector x, HVX_VectorPair_x4 coeff) { using namespace hexagon::vec::math; - HVX_Vector one = Q6_Vh_vsplat_R(0x3c00); + + constexpr __fp16 kMaxExp = 11.0898664f; // log(INF) + + const HVX_Vector max_exp = Q6_Vh_vsplat_R(reinterpret_cast(kMaxExp)); + HVX_Vector one = Q6_Vh_vsplat_R(0x3c00); // x/(1.0f + expf(-x)); - HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x)); - HVX_Vector denom = 
Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one)); - return qhmath_hvx_div_vhf(x, denom, coeff); + HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x)); + HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VhfVhf(exp_neg_x, max_exp); + HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one)); + HVX_Vector out = qhmath_hvx_div_vhf(x, denom, coeff); + out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + return out; } inline HVX_Vector vec_swiglu_f32_f32(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) { diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.inl b/ggml/src/ggml-qnn/npu/device/vec_ops.inl index 854d975edb94c..f2bb174499ee4 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.inl +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.inl @@ -16,16 +16,18 @@ template 1) { - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; do { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; @@ -33,14 +35,19 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + + HVX_Vector mpy0 = _MpyFunc(l0, l1); + HVX_Vector mpy1 = _MpyFunc(h0, h1); prev0 = Q6_V_hi_W(curr0); prev1 = Q6_V_hi_W(curr1); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + src0_vec_ptr += 2; src1_vec_ptr += 2; } while (src0_vec_ptr_end - src0_vec_ptr > 1); @@ -73,10 +80,11 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size src1_vec_ptr += should_fetch_src1 ? 
1 : 0; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - prev0 = curr0; - prev1 = curr1; - sum = _AddFunc(_MpyFunc(s0, s1), sum); + HVX_Vector mpy0 = _MpyFunc(s0, s1); + prev0 = curr0; + prev1 = curr1; + sum = _AddFunc(mpy0, sum); } if (leftover > 0) { @@ -92,7 +100,7 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes), sum); } return _ReduceFunc(sum); @@ -106,36 +114,38 @@ template 3) { - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); - - do { - HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); - - HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); - - src0_vec_ptr += 4; - src1_vec_ptr += 4; - } while (src0_vec_ptr_end - src0_vec_ptr > 3); - - sum0 = _AddFunc(sum2, sum0); - sum1 = _AddFunc(sum3, sum1); - } + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; + while (src0_vec_ptr_end - src0_vec_ptr > 3) { + HVX_VectorPair curr00 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_VectorPair curr01 = reinterpret_cast(src0_vec_ptr)[1]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + + HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)); + HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)); + + HVX_Vector mpy2 = _MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)); + HVX_Vector mpy3 = _MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + + sum0 = _AddFunc(mpy2, sum0); + sum1 = _AddFunc(mpy3, sum1); + + src0_vec_ptr += 4; + src1_vec_ptr += 4; + }; if (src0_vec_ptr_end - src0_vec_ptr > 1) { HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; @@ -143,8 +153,11 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr src0_vec_ptr += 2; src1_vec_ptr += 2; - sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); + HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)); + HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); } sum = _AddFunc(sum0, sum1); @@ -195,6 +208,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr constexpr const __fp16 kOne = 1.0f; const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + const HVX_Vector kZeroV = Q6_V_vzero(); const _TElem0 * const src0_ptr_end = src0 + count; HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); @@ -202,27 +216,33 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; HVX_Vector prev0 = *src0_vec_ptr++; HVX_Vector prev1 = *src1_vec_ptr++; - HVX_Vector sum = 
Q6_V_vzero(); + HVX_Vector sum = kZeroV; if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; do { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + HVX_Vector curr0 = src0_vec_ptr[0]; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV); - HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); - sum0 = _AddFunc(_MpyFunc(s0_pair.first, l1), sum0); + HVX_Vector curr10 = src1_vec_ptr[0]; + HVX_Vector curr11 = src1_vec_ptr[1]; - HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); - sum1 = _AddFunc(_MpyFunc(s0_pair.second, h1), sum1); + HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, (size_t) src1); + HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, (size_t) src1); + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], l1); + HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], h1); prev0 = curr0; - prev1 = Q6_V_hi_W(curr1); + prev1 = curr11; + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + src0_vec_ptr++; src1_vec_ptr += 2; } while (src1_vec_ptr_end - src1_vec_ptr > 1); @@ -245,8 +265,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr if (has_remaining_src1_vector) { HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = _AddFunc(_MpyFunc(s0_pair.first, s1), sum); - prev1 = curr1; + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], s1); + prev1 = curr1; + + sum = _AddFunc(mpy0, sum); } bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); @@ -254,9 +277,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr src1_vec_ptr += should_fetch_src1 ? 1 : 0; HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); prev0 = curr0; - prev1 = curr1; - sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? s0_pair.second : s0_pair.first, s1), sum); + HVX_Vector mpy1 = _MpyFunc(has_remaining_src1_vector ? s0_pair.val[1] : s0_pair.val[0], s1); + prev1 = curr1; + + sum = _AddFunc(mpy1, sum); } if (leftover1 > 0) { @@ -274,8 +299,8 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr HVX_Vector_x2 curr0_pair = _ExpandFunc(curr0, kOneV); - curr0 = leftover1 == leftover0 ? curr0_pair.first : curr0_pair.second; - sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum); + curr0 = leftover1 == leftover0 ? 
curr0_pair.val[0] : curr0_pair.val[1]; + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes1), sum); } return _ReduceFunc(sum); @@ -299,44 +324,55 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem constexpr const __fp16 kOne = 1.0f; const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + const HVX_Vector kZeroV = Q6_V_vzero(); HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; - HVX_Vector sum0 = Q6_V_vzero(); - HVX_Vector sum1 = Q6_V_vzero(); + HVX_Vector sum0 = kZeroV; + HVX_Vector sum1 = kZeroV; - if (src1_vec_ptr_end - src1_vec_ptr > 3) { - HVX_Vector sum2 = Q6_V_vzero(); - HVX_Vector sum3 = Q6_V_vzero(); + while (src1_vec_ptr_end - src1_vec_ptr > 3) { + HVX_Vector curr0_lo = src0_vec_ptr[0]; + HVX_Vector curr10_lo = src1_vec_ptr[0]; - do { - HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; - HVX_Vector_x2 curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); - HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(curr00.first, Q6_V_lo_W(curr10)), sum0); - sum1 = _AddFunc(_MpyFunc(curr00.second, Q6_V_hi_W(curr10)), sum1); + HVX_Vector curr0_hi = src0_vec_ptr[1]; + HVX_Vector_x2 curr00 = _ExpandFunc(curr0_lo, kOneV); - HVX_Vector_x2 curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); - HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; - sum2 = _AddFunc(_MpyFunc(curr01.first, Q6_V_lo_W(curr11)), sum2); - sum3 = _AddFunc(_MpyFunc(curr01.second, Q6_V_hi_W(curr11)), sum3); + HVX_Vector curr10_hi = src1_vec_ptr[1]; + HVX_Vector_x2 curr01 = _ExpandFunc(curr0_hi, kOneV); - src0_vec_ptr += 2; - src1_vec_ptr += 4; - } while (src1_vec_ptr_end - src1_vec_ptr > 3); + HVX_Vector mpy0 = _MpyFunc(curr00.val[0], curr10_lo); + HVX_Vector mpy1 = _MpyFunc(curr00.val[1], curr10_hi); - sum0 = _AddFunc(sum0, sum2); - sum1 = _AddFunc(sum1, sum3); - } + HVX_Vector curr11_lo = src1_vec_ptr[2]; + HVX_Vector curr11_hi = src1_vec_ptr[3]; + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); + + HVX_Vector mpy2 = _MpyFunc(curr01.val[0], curr11_lo); + HVX_Vector mpy3 = _MpyFunc(curr01.val[1], curr11_hi); + + sum0 = _AddFunc(mpy2, sum0); + sum1 = _AddFunc(mpy3, sum1); + + src0_vec_ptr += 2; + src1_vec_ptr += 4; + }; if (src1_vec_ptr_end - src1_vec_ptr > 1) { - HVX_Vector curr0 = src0_vec_ptr[0]; - HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_Vector curr1_lo = src1_vec_ptr[0]; - HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; - sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0); - sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1); + HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV); + HVX_Vector curr1_hi = src1_vec_ptr[1]; + + HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], curr1_lo); + HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], curr1_hi); + + sum0 = _AddFunc(mpy0, sum0); + sum1 = _AddFunc(mpy1, sum1); } return _ReduceFunc(_AddFunc(sum0, sum1)); @@ -360,14 +396,14 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; src_vec_ptr += 2; - HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); - dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); + HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); + HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), 
Q6_V_lo_W(curr), (size_t) src); + prev = Q6_V_hi_W(curr); - HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); + dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); dst_vec_ptr += 2; - prev = Q6_V_hi_W(curr); } if (src_vec_end - src_vec_ptr > 0) { @@ -405,14 +441,16 @@ template inline void vec_zero_impl(_TData * src, size_t count) HVX_UVector * src_vec_ptr = ((HVX_UVector *) src); HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector); + const HVX_Vector kZeroV = Q6_V_vzero(); + while (src_vec_end - src_vec_ptr > 1) { - src_vec_ptr[0] = Q6_V_vzero(); - src_vec_ptr[1] = Q6_V_vzero(); + src_vec_ptr[0] = kZeroV; + src_vec_ptr[1] = kZeroV; src_vec_ptr += 2; } if (src_vec_end - src_vec_ptr > 0) { - src_vec_ptr[0] = Q6_V_vzero(); + src_vec_ptr[0] = kZeroV; src_vec_ptr++; } @@ -420,7 +458,7 @@ template inline void vec_zero_impl(_TData * src, size_t count) if (leftover > 0) { // handle the leftover elements const size_t leftover_bytes = leftover * sizeof(_TData); - q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, Q6_V_vzero()); + q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, kZeroV); } } diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 526191173dd17..7e8f5db7dd85e 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -90,13 +90,16 @@ bool host_graph::compute() { return false; } + LOG_DEBUG("[%p]host_graph::compute started\n", (void *) this); SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle); auto status = npu_device_graph_compute(_device_handle, _graph_handle); if (status != AEE_SUCCESS) { LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); + LOG_DEBUG("[%p]host_graph::compute finished with failure\n", (void *) this); return false; } + LOG_DEBUG("[%p]host_graph::compute finished\n", (void *) this); return true; } diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index fca1167282765..5e9f51887961b 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -242,6 +242,7 @@ bool npu_device::init_rpc_mem() { bool npu_device::init_device_lib() { if (!_device_handle) { + set_fast_rpc_stack_size(_rpc_interface, _dsp_domain_id, NPU_THREAD_STACK_SIZE); auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); const auto & device_lib_info = get_device_library_info(arch); std::string device_lib_uri = device_lib_info.device_lib_uri; diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index f70526bf25dff..d2d07de897a95 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -1,14 +1,14 @@ #pragma once -#include -#include -#include - #include "common.hpp" #include "ggml-impl.h" #include "hexagon_npu.h" #include "util.hpp" +#include +#include +#include + namespace hexagon { // TODO: merge this with device tensor? 
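
Note on the type_traits changes above: device_type_traits gains a load_dequant_table hook and every to_float callback now takes an extra HVX_Vector table argument. A minimal caller-side sketch of how the two are meant to fit together is below. It is illustrative only: the helper name dequantize_rows and its parameters are assumptions, not code from this patch, and it assumes the HVX intrinsic headers already pulled in by type_traits.hpp.

#include "type_traits.hpp"   // hexagon::get_type_traits, hexagon::dequant_output_type

// Hypothetical helper: dequantize `row_count` rows of a quantized tensor, loading
// the per-type lookup table once and reusing it for every row.
inline void dequantize_rows(npu_device_tensor_data_type    type,
                            const uint8_t *                src,
                            size_t                         src_row_bytes,
                            hexagon::dequant_output_type * dst,
                            size_t                         row_elems,
                            size_t                         row_count) {
    const auto & traits = hexagon::get_type_traits(type);
    if (!traits.to_float) {
        return;  // no dequantizer registered for this type
    }

    // Types without a table (e.g. the plain F16/F32 copy paths) leave
    // load_dequant_table as nullptr and simply receive a zero vector.
    const HVX_Vector table = traits.load_dequant_table ? traits.load_dequant_table() : Q6_V_vzero();

    for (size_t r = 0; r < row_count; ++r) {
        traits.to_float(src + r * src_row_bytes, dst + r * row_elems, row_elems, table);
    }
}
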
@@ -62,7 +62,7 @@ class host_tensor { ~host_tensor() { LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); - if (_device_tensor_handle) { + if (_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) { npu_device_tensor_free(_device_handle, _device_tensor_handle); // TODO: figure out why the _ggml_tensor is invalid here } @@ -113,8 +113,11 @@ class host_tensor { if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) { params_changed = true; memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); - LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this, - (int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2], + LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", + (void *) this, + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); } @@ -136,19 +139,29 @@ class host_tensor { if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { params_changed = true; memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)); - LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this, - (void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]); + LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", + (void *) this, + (void *) _info_update.src_handles[0], + (void *) _info_update.src_handles[1]); } if (params_changed) { npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update); - LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); } else { - LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); } } @@ -174,9 +187,13 @@ class host_tensor { #endif } - LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, - ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], - (int) _info_update.params[2], (int) _info_update.params[3]); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", + (void *) this, + ggml_op_desc(_ggml_tensor), + (int) _info_update.params[0], + (int) _info_update.params[1], + (int) _info_update.params[2], + (int) _info_update.params[3]); return _info_update; } @@ -192,11 +209,21 @@ class host_tensor { } int get_desc(char * buffer, size_t size) const { - return snprintf(buffer, size, "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p", - 
_ggml_tensor->name, (long) _ggml_tensor->ne[0], (long) _ggml_tensor->ne[1], - (long) _ggml_tensor->ne[2], (long) _ggml_tensor->ne[3], (long) _ggml_tensor->nb[0], - (long) _ggml_tensor->nb[1], (long) _ggml_tensor->nb[2], (long) _ggml_tensor->nb[3], - ggml_type_name(_ggml_tensor->type), (void *) this, (void *) _ggml_tensor, + return snprintf(buffer, + size, + "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p", + _ggml_tensor->name, + (long) _ggml_tensor->ne[0], + (long) _ggml_tensor->ne[1], + (long) _ggml_tensor->ne[2], + (long) _ggml_tensor->ne[3], + (long) _ggml_tensor->nb[0], + (long) _ggml_tensor->nb[1], + (long) _ggml_tensor->nb[2], + (long) _ggml_tensor->nb[3], + ggml_type_name(_ggml_tensor->type), + (void *) this, + (void *) _ggml_tensor, (void *) _device_tensor_handle); } diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 13a21c1f9efe1..28aaf34cb7ee2 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -149,6 +149,23 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_ } } +void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size) { + constexpr const uint32_t FASTRPC_THREAD_PARAMS = 1; + + if (!rpc_interface || !rpc_interface->is_valid()) { + return; + } + + remote_rpc_thread_params tp = {}; + tp.domain = domain_id; + tp.prio = -1; + tp.stack_size = stack_size; + auto ret = rpc_interface->remote_session_control(FASTRPC_THREAD_PARAMS, &tp, sizeof(tp)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to set fast RPC stack size: 0x%x\n", ret); + } +} + void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { if (dst == nullptr) { snprintf(out, max_len, "null"); @@ -161,15 +178,30 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { switch (dims) { default: case 4: - snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], - (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]); + snprintf(out, + max_len, + "%s[%ldx%ldx%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], + (long) tensor->ne[1], + (long) tensor->ne[2], + (long) tensor->ne[3]); break; case 3: - snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], - (long) tensor->ne[1], (long) tensor->ne[2]); + snprintf(out, + max_len, + "%s[%ldx%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], + (long) tensor->ne[1], + (long) tensor->ne[2]); break; case 2: - snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + snprintf(out, + max_len, + "%s[%ldx%ld]", + ggml_type_name(tensor->type), + (long) tensor->ne[0], (long) tensor->ne[1]); break; case 1: @@ -201,8 +233,14 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); char src3_desc[256]; print_tensor(dst->src[3], src3_desc, sizeof(src3_desc)); - snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc, - src1_desc, src2_desc, src3_desc); + snprintf(out, + max_len, + "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", + dst_desc, + src0_desc, + src1_desc, + src2_desc, + src3_desc); return; } case 3: @@ -213,8 +251,8 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { print_tensor(dst->src[1], src1_desc, sizeof(src1_desc)); char src2_desc[256]; 
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); - snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, - src2_desc); + snprintf( + out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, src2_desc); return; } case 2: diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index b4c2355cac298..44e482679b3ad 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -23,6 +23,7 @@ hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t const char * get_dsp_arch_desc(hexagon_dsp_arch arch); void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); +void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size); void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len); diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index 5aab3524c6043..bc7de725abd3f 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -13,6 +13,8 @@ const uint32_t NPU_ROPE_TYPE_NEOX = 2; const uint32_t NPU_ROPE_TYPE_MROPE = 8; const uint32_t NPU_ROPE_TYPE_VISION = 24; +const uint32_t NPU_THREAD_STACK_SIZE = 64 * 1024; + interface npu_device : remote_handle64{ typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS];
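
For readers following the new INF guards in vec_silu_f32_f32 / vec_silu_f16_f16 and the *_guard_inf helpers, a scalar model of what a guarded lane computes may help. This is a sketch, not code from the patch; silu_ref_f32 is a made-up name, and it assumes the same f32 threshold the patch uses (88.02f, a conservative bound below ln(FLT_MAX) ≈ 88.72; the f16 path uses 11.0898664f ≈ ln(65504), the largest finite half).

#include <cmath>

// Scalar model of the guarded SiLU lane: x / (1 + exp(-x)), with the overflow
// case short-circuited the same way the vector code's mux-to-zero does.
static inline float silu_ref_f32(float x) {
    constexpr float kMaxExp = 88.02f;  // above this, expf() would overflow to +INF
    const float neg_x = -x;
    if (neg_x > kMaxExp) {
        // exp(-x) -> +INF, so the denominator is +INF and the quotient collapses to 0;
        // this is the lane value Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out) selects.
        return 0.0f;
    }
    return x / (1.0f + std::exp(neg_x));
}
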