Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4950,6 +4950,21 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
}

static inline bool use_flat_gemv_for_large_m_q4_K(const ggml_tensor *tensor) {
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
// note that this forces large M weights to use LM GEMM.
return tensor->ne[1] >= 32768 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

static inline bool use_flat_gemv_for_large_m_q6_K(const ggml_tensor *tensor) {
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
// q6_K flat gemv is worse for smaller K; 2048 seems to be a reasonable threshold.
// note that this forces large M weights to use LM GEMM.
return tensor->ne[1] >= 32768 && tensor->ne[0] >= 2048 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
Expand Down Expand Up @@ -6595,7 +6610,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
}
#else
Expand Down Expand Up @@ -6623,7 +6638,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

tensor->extra = extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {

int M = tensor->ne[1];
int K = tensor->ne[0];
Expand Down Expand Up @@ -6923,7 +6938,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
cl_kernel kernel;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
kernel = backend_ctx->kernel_convert_block_q6_K;
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
}
#else
Expand Down Expand Up @@ -6956,7 +6971,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
tensor->extra = extra;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
cl_int M = tensor->ne[1]; // ne01
cl_int K = tensor->ne[0]; // ne00

Expand Down Expand Up @@ -7599,7 +7614,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
int M = tensor->ne[1];
int K = tensor->ne[0];

Expand Down Expand Up @@ -7820,7 +7835,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (use_adreno_kernels(backend_ctx, tensor)) {
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
static ggml_cl_buffer buf_trans_ql;
static ggml_cl_buffer buf_trans_qh;
static ggml_cl_buffer buf_trans_s;
Expand Down Expand Up @@ -13213,13 +13228,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}

// q4_k x fp32
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q4_K(src0)) {
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
return;
}

// q6_K x fp32
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q6_K(src0)) {
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
return;
}
Expand Down
Loading