Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
const bool has_weight_idx_indirect = weight_index_indirect != nullptr;
const bool single_scale_weights = (block_size == K * N);
if (M < min_M_for_tile_optimization) {
uint32_t tile_size_k_vec = 16;
uint32_t tile_size_n = 32;
uint32_t tile_size_k_vec = 32;
uint32_t tile_size_n = 4;

if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
tile_size_k_vec = 32;
tile_size_n = 4;
}
const uint32_t b_components = (nbits == 2 ? kVec2Components : kVec4Components);
DP4AMatMulNBitsSmallMProgram mul_program{tile_size_k_vec, tile_size_n, nbits, has_zero_points, has_bias, has_weight_idx, has_weight_idx_indirect, single_scale_weights};
uint32_t num_N_tile = (N + tile_size_n - 1) / tile_size_n;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
}
#endif

// On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.
// On FP32 only GPUs and Qualcomm GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.
// DP4A Q2 path now supports custom zero points via a 1024-entry LUT (4 zero-point sections × 256 byte values).
if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>() || context.AdapterInfo().vendor == std::string_view{"qualcomm"}) &&
CanApplyDP4AMatrixMatMulNBits(context, accuracy_level, block_size, N, K, components_a)) {
Expand Down
Loading