diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 900974a0b4a11..f7c679baa15b1 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -128,13 +128,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor const bool has_weight_idx_indirect = weight_index_indirect != nullptr; const bool single_scale_weights = (block_size == K * N); if (M < min_M_for_tile_optimization) { - uint32_t tile_size_k_vec = 16; - uint32_t tile_size_n = 32; + uint32_t tile_size_k_vec = 32; + uint32_t tile_size_n = 4; - if (context.AdapterInfo().vendor == std::string_view{"intel"}) { - tile_size_k_vec = 32; - tile_size_n = 4; - } const uint32_t b_components = (nbits == 2 ? kVec2Components : kVec4Components); DP4AMatMulNBitsSmallMProgram mul_program{tile_size_k_vec, tile_size_n, nbits, has_zero_points, has_bias, has_weight_idx, has_weight_idx_indirect, single_scale_weights}; uint32_t num_N_tile = (N + tile_size_n - 1) / tile_size_n; diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 7d99256682b6b..044bb484ec4fb 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -230,7 +230,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, } #endif - // On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M. + // On FP32 only GPUs and Qualcomm GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M. // DP4A Q2 path now supports custom zero points via a 1024-entry LUT (4 zero-point sections × 256 byte values). 
if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>() || context.AdapterInfo().vendor == std::string_view{"qualcomm"}) && CanApplyDP4AMatrixMatMulNBits(context, accuracy_level, block_size, N, K, components_a)) {