From 770a38f5ab2d5988bf39efb6a1a4218bd57ad927 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Wed, 3 Jun 2026 16:34:17 +0200
Subject: [PATCH 1/2] Enroll mul_mat_vec_q_moe into PDL, boosting MTP
 performance on BW
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Data collected on a B4500:

Before
```
(llama.cpp) ➜  llama.cpp git:(master) ✗ python mtp-bench.py
  code_python        pred= 192 draft= 150 acc= 116 rate=0.773 tok/s=202.8
  code_cpp           pred= 192 draft= 147 acc= 117 rate=0.796 tok/s=212.8
  explain_concept    pred= 192 draft= 161 acc= 110 rate=0.683 tok/s=196.4
  summarize          pred= 192 draft= 138 acc= 122 rate=0.884 tok/s=226.6
  qa_factual         pred= 192 draft= 138 acc= 121 rate=0.877 tok/s=225.1
  translation        pred= 192 draft= 158 acc= 112 rate=0.709 tok/s=201.5
  creative_short     pred= 192 draft= 160 acc= 110 rate=0.688 tok/s=197.2
  stepwise_math      pred= 192 draft= 150 acc= 115 rate=0.767 tok/s=209.2
  long_code_review   pred= 192 draft= 148 acc= 116 rate=0.784 tok/s=208.9
```
After
```
(llama.cpp) ➜  llama.cpp git:(master) ✗ python mtp-bench.py
  code_python        pred= 192 draft= 150 acc= 116 rate=0.773 tok/s=211.9
  code_cpp           pred= 192 draft= 147 acc= 117 rate=0.796 tok/s=224.6
  explain_concept    pred= 192 draft= 161 acc= 110 rate=0.683 tok/s=207.8
  summarize          pred= 192 draft= 138 acc= 122 rate=0.884 tok/s=240.2
  qa_factual         pred= 192 draft= 138 acc= 121 rate=0.877 tok/s=238.5
  translation        pred= 192 draft= 158 acc= 112 rate=0.709 tok/s=213.4
  creative_short     pred= 192 draft= 160 acc= 110 rate=0.688 tok/s=208.8
  stepwise_math      pred= 192 draft= 150 acc= 115 rate=0.767 tok/s=221.7
  long_code_review   pred= 192 draft= 148 acc= 116 rate=0.784 tok/s=220.7
```

Server launched with:
```
➜  llama.cpp git:(osimons/enroll_mul_mat_vec_q_moe_into_PDL) ✗ ./build-x64-linux-gcc-reldbg/bin/llama-server \
    -m /mnt/share/gguf/unsloth/Qwen3.6-35B-A3B-MTP-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf -dio \
    --spec-type draft-mtp \
    --spec-draft-n-max 2 \
    -ngl all \
    -fa on \
    --host 0.0.0.0 \
    --port 8080 -np 1 --chat-template-kwargs "{\"preserve_thinking\": true}"
```
---
 ggml/src/ggml-cuda/mmvq.cu | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 4b0426590acc..0fb759a3de93 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -682,12 +682,16 @@ static __global__ void mul_mat_vec_q(
 template <ggml_type type, int c_rows_per_block>
 __launch_bounds__(get_mmvq_mmid_max_batch_for_device<type>()*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void mul_mat_vec_q_moe(
-        const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids,
-        float * __restrict__ dst,
+        const void * vx_ptr, const void * vy_ptr, const int32_t * ids_ptr,
+        float * dst_ptr,
         const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
         const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
         const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
         const uint32_t ncols_dst, const uint32_t ids_stride) {
+    const void    * GGML_CUDA_RESTRICT vx  = vx_ptr;
+    const void    * GGML_CUDA_RESTRICT vy  = vy_ptr;
+    const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
+    float         * GGML_CUDA_RESTRICT dst = dst_ptr;
 
     constexpr int qk  = ggml_cuda_type_traits<type>::qk;
     constexpr int qi  = ggml_cuda_type_traits<type>::qi;
@@ -707,6 +711,7 @@ static __global__ void mul_mat_vec_q_moe(
         return;
     }
 
+    ggml_cuda_pdl_sync();
     const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride];
     const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);
 
@@ -794,8 +799,9 @@ static void mul_mat_vec_q_moe_launch(
     const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block;
     const dim3 block_nums(nblocks_rows, nchannels_dst);
     const dim3 block_dims(warp_size, ncols_dst);
+    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
 
-    mul_mat_vec_q_moe<type, rows_per_block><<<block_nums, block_dims, 0, stream>>>(
+    ggml_cuda_kernel_launch(mul_mat_vec_q_moe<type, rows_per_block>, launch_params,
         vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x,
         stride_row_x, stride_col_y, stride_col_dst,
         stride_channel_x, stride_channel_y, stride_channel_dst,

From b508845ca5b9062308f117d68f17028d194fc8d0 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Wed, 3 Jun 2026 18:19:46 +0200
Subject: [PATCH 2/2] LC to overlap with following kernels

---
 ggml/src/ggml-cuda/mmvq.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 0fb759a3de93..bdfbfd2d387f 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -731,6 +731,8 @@ static __global__ void mul_mat_vec_q_moe(
         }
     }
 
+    ggml_cuda_pdl_lc();
+
     // Warp-level reduction only - no shared memory needed
 #pragma unroll
     for (int i = 0; i < c_rows_per_block; ++i) {