ggml-org · JohannesGaessler · May 28, 2026 · May 19, 2026 · May 19, 2026 · May 21, 2026
@@ -2569,6 +2569,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
             use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
             use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+            use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
         }
     } else {
@@ -2577,6 +2578,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+        use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
     }
 

@@ -271,6 +271,43 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
     return MMVQ_MAX_BATCH_SIZE;
 }
 
+// On AMD MFMA hardware (CDNA), pick the per-quant batch threshold above which
+// MMVQ should yield to the MMQ (MFMA-tiled GEMM) path. The crossover differs
+// noticeably by quant family because the per-row GEMV cost is dominated by the
+// dequantisation work, not the dot-product itself: K-quants pay a heavier
+// per-row decode (super-block scales) and so MMQ wins sooner; legacy and IQ
+// quants have lean decode and stay ahead until the batch is wide enough to
+// fully populate an MFMA tile.
+//
+// Calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8, 10 reps each),
+// across all 20 supported quant types. See PR description for the full table.
+static int64_t mmvq_max_batch_amd_mfma(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return 3; // MMQ wins from batch=4 onward (+5% to +76%)
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q6_K:
+            return 5; // MMQ wins from batch=6 onward (+8% to +35%)
+        default:
+            // Legacy (Q4_0/Q4_1/Q5_0/Q5_1/Q8_0) and IQ quants regress under MMQ
+            // up to batch=7, so keep the global threshold for them.
+            return MMVQ_MAX_BATCH_SIZE;
+    }
+}
+
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
+    static const bool force_mmvq = (getenv("GGML_CUDA_FORCE_MMVQ") != nullptr);
+    if (force_mmvq) {
+        return ne11 <= MMVQ_MAX_BATCH_SIZE;
+    }
-    static const bool force_mmvq = (getenv("GGML_CUDA_FORCE_MMVQ") != nullptr);
-    if (force_mmvq) {
-        return ne11 <= MMVQ_MAX_BATCH_SIZE;
-    }
-    static const bool force_mmvq = (getenv("GGML_CUDA_FORCE_MMVQ") != nullptr);
-    if (force_mmvq) {
-        return ne11 <= MMVQ_MAX_BATCH_SIZE;
-    }
+    if (amd_mfma_available(cc)) {
-    if (amd_mfma_available(cc)) {
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
-    if (amd_mfma_available(cc)) {
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+        return ne11 <= mmvq_max_batch_amd_mfma(type);
+    }
+    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+}
+
 // Device constexpr: returns the max batch size for the current arch+type at compile time.
 template <ggml_type type>
 static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {

@@ -2,6 +2,20 @@
 
 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
 
+// Returns true if a quantized matmul of shape (..., ne11) on a device with
+// compute capability `cc` should take the MMVQ (per-row GEMV) path.
+// Returning false sends it to the MMQ path (batched GEMM, MFMA-tiled on CDNA).
+//
+// On AMD MFMA hardware (CDNA) the optimal batch threshold is quant-dependent:
+// K-quants have a heavier per-row GEMV (block scales + super-block decode), so
+// MFMA-tiled MMQ overtakes MMVQ at a smaller batch; legacy and IQ quants have
+// lean GEMV kernels that stay ahead until the batch nearly fills an MFMA tile.
+// Thresholds calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8) — see
+// the PR description for the full sweep.
+//
+// Set GGML_CUDA_FORCE_MMVQ=1 to restore the original global threshold.
-// Returns true if a quantized matmul of shape (..., ne11) on a device with
-// compute capability `cc` should take the MMVQ (per-row GEMV) path.
-// Returning false sends it to the MMQ path (batched GEMM, MFMA-tiled on CDNA).
-//
-// On AMD MFMA hardware (CDNA) the optimal batch threshold is quant-dependent:
-// K-quants have a heavier per-row GEMV (block scales + super-block decode), so
-// MFMA-tiled MMQ overtakes MMVQ at a smaller batch; legacy and IQ quants have
-// lean GEMV kernels that stay ahead until the batch nearly fills an MFMA tile.
-// Thresholds calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8) — see
-// the PR description for the full sweep.
-//
-// Set GGML_CUDA_FORCE_MMVQ=1 to restore the original global threshold.
-// Returns true if a quantized matmul of shape (..., ne11) on a device with
-// compute capability `cc` should take the MMVQ (per-row GEMV) path.
-// Returning false sends it to the MMQ path (batched GEMM, MFMA-tiled on CDNA).
-//
-// On AMD MFMA hardware (CDNA) the optimal batch threshold is quant-dependent:
-// K-quants have a heavier per-row GEMV (block scales + super-block decode), so
-// MFMA-tiled MMQ overtakes MMVQ at a smaller batch; legacy and IQ quants have
-// lean GEMV kernels that stay ahead until the batch nearly fills an MFMA tile.
-// Thresholds calibrated on MI250X with Llama-3.2-3B (pp512, ubatch 1..8) — see
-// the PR description for the full sweep.
-//
-// Set GGML_CUDA_FORCE_MMVQ=1 to restore the original global threshold.
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
+
 // Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
 // based on the quantization type and GPU architecture (compute capability).
 int get_mmvq_mmid_max_batch(ggml_type type, int cc);