Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2569,6 +2569,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
} else {
Expand All @@ -2577,6 +2578,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}

Expand Down
47 changes: 47 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
return MMVQ_MAX_BATCH_SIZE;
}

bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
if (GGML_CUDA_CC_IS_CDNA(cc)) {
if (GGML_CUDA_CC_IS_CDNA1(cc)) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return ne11 <= 7;
case GGML_TYPE_Q5_1:
return ne11 <= 7;
case GGML_TYPE_Q8_0:
return ne11 <= 6;
case GGML_TYPE_Q2_K:
return ne11 <= 4;
case GGML_TYPE_Q3_K:
return ne11 <= 3;
case GGML_TYPE_Q4_K:
return ne11 <= 2;
case GGML_TYPE_Q5_K:
return ne11 <= 3;
case GGML_TYPE_Q6_K:
return ne11 <= 4;
case GGML_TYPE_IQ1_S:
return ne11 <= 5;
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ4_XS:
return ne11 <= 6;
default:
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}
}
switch (type) { // tuned for CDNA2
case GGML_TYPE_Q2_K:
return ne11 <= 5;
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return ne11 <= 3;
case GGML_TYPE_Q6_K:
return ne11 <= 5;
default:
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}
}
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}

// Device constexpr: returns the max batch size for the current arch+type at compile time.
template <ggml_type type>
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);

// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
// based on the quantization type and GPU architecture (compute capability).
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
Expand Down
Loading