ggml-org · Zbig9000 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh
@@ -1194,7 +1194,12 @@ struct ggml_cuda_graph {
 
     bool is_enabled() const {
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
-        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
+        // Disable graphs when the per-op perf logger is on: graph capture
+        // would either hide individual-op timings inside cudaGraphLaunch
+        // or re-record over still-pending events on subsequent launches.
+        // See ggml-cuda.cu's ggml_cuda_perf_logger comment for context.
+        static const bool disable_cuda_graphs_due_to_perf_logger = (getenv("GGML_CUDA_PERF_LOGGER") != nullptr);
+        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_cuda_graphs_due_to_perf_logger);
     }
 #endif
 };
@@ -1467,11 +1472,23 @@ struct ggml_cuda_mm_fusion_args_host {
     const ggml_tensor * x_bias = nullptr;
     const ggml_tensor * gate = nullptr;
     const ggml_tensor * gate_bias = nullptr;
+    // Residual tensor added to the matmul output AFTER bias and (if any) gate.
+    // When both x_bias and x_residual are set the kernel performs
+    //     dst = mat * y + bias + residual
+    // in a single dispatch, mirroring ggml-vulkan's MUL_MAT_ADD_ADD shader and
+    // saving the launch overhead of a stand-alone GGML_OP_ADD per residual
+    // connection.  Used by the 3-op MUL_MAT + ADD(bias) + ADD(residual) fusion
+    // detected in ggml_backend_cuda_graph_compute.  Must have ne[0] ==
+    // dst->ne[0] and the same shape as the bias-add output (no broadcasting).
+    // Set to nullptr for normal 2-op MUL_MAT + ADD(bias) fusion or unfused
+    // dispatch.
+    const ggml_tensor * x_residual = nullptr;
     ggml_glu_op glu_op;
 };
 struct ggml_cuda_mm_fusion_args_device {
     const void * x_bias = nullptr;
     const void * gate = nullptr;
     const void * gate_bias = nullptr;
+    const void * x_residual = nullptr;  // see _host counterpart for semantics
     ggml_glu_op glu_op;
 };
diff --git a/src/ggml-cuda/conv-transpose-1d.cu b/src/ggml-cuda/conv-transpose-1d.cu
@@ -1,57 +1,110 @@
 #include "conv-transpose-1d.cuh"
 
-static  __global__ void conv_transpose_1d_kernel(
+// One CUDA warp (32 threads) cooperatively computes one output pixel
+// dst[oc, ol] (== dst[ol + OL*oc] in linear index, since we keep ne2/ne3 == 1).
+//
+// Grid : (OL, OC, 1)
+// Block: (32, 1, 1) — exactly one warp; sized below as CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE.
+//
+// Two perf-critical changes vs the original "1 thread per output pixel + scan
+// the full IC*IL grid + skip via conditional" implementation:
+//
+//   1. Narrow the input position i to the small range that actually
+//      contributes:
+//          out[ol, oc] = sum over (ic, i, ki) of  k[ki, oc, ic] * x[i, ic]
+//          subject to  i*s0 + ki == ol,  0 <= ki < K,  0 <= i < IL
+//      ⇒    i ∈ [ ceil((ol - K + 1)/s0), floor(ol/s0) ] ∩ [0, IL-1]
+//      typically (KS=16, s0=8) this is 2 iterations of i instead of IL=O(100).
+//
+//   2. Parallelise the IC reduction across the warp (each thread handles a
+//      strided slice of IC) and finalise with __shfl_xor_sync.  This gives
+//      32× useful work per warp on top of the i-range narrowing.
+//
+// Layouts (matching the original kernel and the Vulkan / Metal patches):
+//   src0 (kernel) : [K, OC, IC] row-major  → element (ki, oc, ic) at
+//                                            ic*(OC*K) + oc*K + ki
+//   src1 (input)  : [IL, IC]    row-major  → element (i, ic) at ic*IL + i
+//   dst           : [OL, OC]    row-major  → element (ol, oc) at oc*OL + ol
+//
+// Limitation (unchanged from the original kernel): only ne1==ne3==1 is
+// supported; the host-side wrapper enforces that via the contiguous +
+// shape assertions.
+static __global__ void conv_transpose_1d_kernel(
         const int s0, const int p0, const int d0, const int output_size,
         const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
         const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
-        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
-        const float * src0, const float * src1,  float * dst) {
-    int global_index = threadIdx.x + blockIdx.x * blockDim.x;
-    if (global_index >= output_size) {
+        const int dst_ne0,  const int dst_ne1,  const int dst_ne2,  const int dst_ne3,
+        const float * __restrict__ src0, const float * __restrict__ src1, float * __restrict__ dst) {
+
+    const int ol = blockIdx.x;
+    const int oc = blockIdx.y;
+    if (ol >= dst_ne0 || oc >= dst_ne1) {
         return;
     }
 
-    int out_index = global_index / dst_ne0;
-
-    float accumulator = 0;
-
-    for (int c = 0; c < src0_ne2; c++) {
-        int idx = global_index % dst_ne0;
-
-        int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
-        int input_offset = src1_ne0 * c;
-
-        for (int i = 0; i < src1_ne0; i++) {
-            if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
-                continue;
-            }
-            int weight_idx = idx - i*s0;
+    const int K  = src0_ne0;
+    const int OC = dst_ne1;
+    const int IC = src0_ne2;
+    const int IL = src1_ne0;
+
+    // Range of input positions i that contribute to this output pixel.
+    int i_start = (ol - K + 1 + s0 - 1) / s0;   // ceil((ol - K + 1) / s0)
+    if (i_start < 0)      i_start = 0;
+    int i_end = ol / s0;
+    if (i_end > IL - 1)   i_end = IL - 1;
+
+    const int tid = threadIdx.x;
+    const int nth = blockDim.x;
+
+    float v = 0.0f;
+
+    // Each thread handles a strided slice of IC; the range of i is
+    // already narrow (≤ K/s0 + 1), so the inner loop is the cheap one.
+    for (int ic = tid; ic < IC; ic += nth) {
+        const int kernel_base = (ic * OC + oc) * K;
+        const int input_base  = ic * IL;
+        #pragma unroll 4
+        for (int i = i_start; i <= i_end; ++i) {
+            const int ki = ol - i * s0;
+            v += src0[kernel_base + ki] * src1[input_base + i];
+        }
+    }
 
-            float kernel_weight = src0[kernel_offset + weight_idx];
-            float input_value =  src1[input_offset+i];
+    // Reduce across the warp.
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        v += __shfl_xor_sync(0xFFFFFFFFu, v, offset);
+    }
 
-            accumulator += kernel_weight * input_value;
-        }
+    if (tid == 0) {
+        dst[oc * dst_ne0 + ol] = v;
     }
-    dst[global_index] = accumulator;
-    GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2);
+
+    GGML_UNUSED_VARS(p0, d0, output_size,
+                     src0_ne1, src0_ne3, src1_ne1, src1_ne2, src1_ne3,
+                     dst_ne2, dst_ne3);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
         const int s0, const int p0, const int d0, const int output_size,
         const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
         const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
-        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
-        const float * src0, const float * src1,  float * dst,
+        const int dst_ne0,  const int dst_ne1,  const int dst_ne2,  const int dst_ne3,
+        const float * src0, const float * src1, float * dst,
         cudaStream_t stream) {
 
-    const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
-    conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
-        s0,p0,d0,output_size,
-        src0_ne0, src0_ne1,  src0_ne2, src0_ne3,
-        src1_ne0, src1_ne1,  src1_ne2, src1_ne3,
-        dst_ne0,  dst_ne1,   dst_ne2,  dst_ne3,
-        src0,src1, dst);
+    // Block = one warp (32 threads).  Grid has one block per output pixel,
+    // i.e. (OL, OC).  ne2/ne3 are required to be 1 by the existing host-side
+    // assertions, so we don't extend the grid into z.
+    const dim3 block_dim(CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 1, 1);
+    const dim3 grid_dim((unsigned)dst_ne0, (unsigned)dst_ne1, 1);
+
+    conv_transpose_1d_kernel<<<grid_dim, block_dim, 0, stream>>>(
+        s0, p0, d0, output_size,
+        src0_ne0, src0_ne1, src0_ne2, src0_ne3,
+        src1_ne0, src1_ne1, src1_ne2, src1_ne3,
+        dst_ne0,  dst_ne1,  dst_ne2,  dst_ne3,
+        src0, src1, dst);
 }
 
 void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

diff --git a/src/ggml-cuda/conv-transpose-1d.cuh b/src/ggml-cuda/conv-transpose-1d.cuh
@@ -1,5 +1,6 @@
 #include "common.cuh"
 
-#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+// One warp per output pixel; see conv-transpose-1d.cu for why.
+#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 32
 
 void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);