Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2569,6 +2569,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
} else {
Expand All @@ -2577,6 +2578,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}

Expand Down
37 changes: 37 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,43 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
return MMVQ_MAX_BATCH_SIZE;
}

// On AMD MFMA hardware (CDNA), pick the per-quant batch threshold above which
// MMVQ should yield to the MMQ (MFMA-tiled GEMM) path. The crossover differs
// noticeably by quant family because the per-row GEMV cost is dominated by the
// dequantisation work, not the dot-product itself: K-quants pay a heavier
// per-row decode (super-block scales) and so MMQ wins sooner; legacy and IQ
// quants have lean decode and stay ahead until the batch is wide enough to
// fully populate an MFMA tile.
//
// Calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8, 10 reps each),
// across all 20 supported quant types. See PR description for the full table.
static int64_t mmvq_max_batch_amd_mfma(ggml_type type) {
switch (type) {
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return 3; // MMQ wins from batch=4 onward (+5% to +76%)
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q6_K:
return 5; // MMQ wins from batch=6 onward (+8% to +35%)
default:
// Legacy (Q4_0/Q4_1/Q5_0/Q5_1/Q8_0) and IQ quants regress under MMQ
// up to batch=7, so keep the global threshold for them.
return MMVQ_MAX_BATCH_SIZE;
}
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inline this function and remove the comments except for "tuned for CDNA2".


bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
static const bool force_mmvq = (getenv("GGML_CUDA_FORCE_MMVQ") != nullptr);
if (force_mmvq) {
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
static const bool force_mmvq = (getenv("GGML_CUDA_FORCE_MMVQ") != nullptr);
if (force_mmvq) {
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}

if (amd_mfma_available(cc)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (amd_mfma_available(cc)) {
if (GGML_CUDA_CC_IS_CDNA(cc)) {

return ne11 <= mmvq_max_batch_amd_mfma(type);
}
return ne11 <= MMVQ_MAX_BATCH_SIZE;
}

// Device constexpr: returns the max batch size for the current arch+type at compile time.
template <ggml_type type>
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
Expand Down
14 changes: 14 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@

#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

// Returns true if a quantized matmul of shape (..., ne11) on a device with
// compute capability `cc` should take the MMVQ (per-row GEMV) path.
// Returning false sends it to the MMQ path (batched GEMM, MFMA-tiled on CDNA).
//
// On AMD MFMA hardware (CDNA) the optimal batch threshold is quant-dependent:
// K-quants have a heavier per-row GEMV (block scales + super-block decode), so
// MFMA-tiled MMQ overtakes MMVQ at a smaller batch; legacy and IQ quants have
// lean GEMV kernels that stay ahead until the batch nearly fills an MFMA tile.
// Thresholds calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8) — see
// the PR description for the full sweep.
//
// Set GGML_CUDA_FORCE_MMVQ=1 to restore the original global threshold.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Returns true if a quantized matmul of shape (..., ne11) on a device with
// compute capability `cc` should take the MMVQ (per-row GEMV) path.
// Returning false sends it to the MMQ path (batched GEMM, MFMA-tiled on CDNA).
//
// On AMD MFMA hardware (CDNA) the optimal batch threshold is quant-dependent:
// K-quants have a heavier per-row GEMV (block scales + super-block decode), so
// MFMA-tiled MMQ overtakes MMVQ at a smaller batch; legacy and IQ quants have
// lean GEMV kernels that stay ahead until the batch nearly fills an MFMA tile.
// Thresholds calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8) — see
// the PR description for the full sweep.
//
// Set GGML_CUDA_FORCE_MMVQ=1 to restore the original global threshold.

bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);

// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
// based on the quantization type and GPU architecture (compute capability).
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
Expand Down