Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1194,7 +1194,12 @@ struct ggml_cuda_graph {

bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
// Disable graphs when the per-op perf logger is on: graph capture
// would either hide individual-op timings inside cudaGraphLaunch
// or re-record over still-pending events on subsequent launches.
// See ggml-cuda.cu's ggml_cuda_perf_logger comment for context.
static const bool disable_cuda_graphs_due_to_perf_logger = (getenv("GGML_CUDA_PERF_LOGGER") != nullptr);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_cuda_graphs_due_to_perf_logger);
}
#endif
};
Expand Down Expand Up @@ -1467,11 +1472,23 @@ struct ggml_cuda_mm_fusion_args_host {
const ggml_tensor * x_bias = nullptr;
const ggml_tensor * gate = nullptr;
const ggml_tensor * gate_bias = nullptr;
// Residual tensor added to the matmul output AFTER bias and (if any) gate.
// When both x_bias and x_residual are set the kernel performs
// dst = mat * y + bias + residual
// in a single dispatch, mirroring ggml-vulkan's MUL_MAT_ADD_ADD shader and
// saving the launch overhead of a stand-alone GGML_OP_ADD per residual
// connection. Used by the 3-op MUL_MAT + ADD(bias) + ADD(residual) fusion
// detected in ggml_backend_cuda_graph_compute. Must have ne[0] ==
// dst->ne[0] and the same shape as the bias-add output (no broadcasting).
// Set to nullptr for normal 2-op MUL_MAT + ADD(bias) fusion or unfused
// dispatch.
const ggml_tensor * x_residual = nullptr;
ggml_glu_op glu_op;
};
struct ggml_cuda_mm_fusion_args_device {
const void * x_bias = nullptr;
const void * gate = nullptr;
const void * gate_bias = nullptr;
const void * x_residual = nullptr; // see _host counterpart for semantics
ggml_glu_op glu_op;
};
123 changes: 88 additions & 35 deletions src/ggml-cuda/conv-transpose-1d.cu
Original file line number Diff line number Diff line change
@@ -1,57 +1,110 @@
#include "conv-transpose-1d.cuh"

static __global__ void conv_transpose_1d_kernel(
// One CUDA warp (32 threads) cooperatively computes one output pixel
// dst[oc, ol] (== dst[ol + OL*oc] in linear index, since we keep ne2/ne3 == 1).
//
// Grid : (OL, OC, 1)
// Block: (32, 1, 1) — exactly one warp; sized below as CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE.
//
// Two perf-critical changes vs the original "1 thread per output pixel + scan
// the full IC*IL grid + skip via conditional" implementation:
//
// 1. Narrow the input position i to the small range that actually
// contributes:
// out[ol, oc] = sum over (ic, i, ki) of k[ki, oc, ic] * x[i, ic]
// subject to i*s0 + ki == ol, 0 <= ki < K, 0 <= i < IL
// ⇒ i ∈ [ ceil((ol - K + 1)/s0), floor(ol/s0) ] ∩ [0, IL-1]
// typically (KS=16, s0=8) this is 2 iterations of i instead of IL=O(100).
//
// 2. Parallelise the IC reduction across the warp (each thread handles a
// strided slice of IC) and finalise with __shfl_xor_sync. This gives
// 32× useful work per warp on top of the i-range narrowing.
//
// Layouts (matching the original kernel and the Vulkan / Metal patches):
// src0 (kernel) : [K, OC, IC] row-major → element (ki, oc, ic) at
// ic*(OC*K) + oc*K + ki
// src1 (input) : [IL, IC] row-major → element (i, ic) at ic*IL + i
// dst : [OL, OC] row-major → element (ol, oc) at oc*OL + ol
//
// Limitation (unchanged from the original kernel): only ne1==ne3==1 is
// supported; the host-side wrapper enforces that via the contiguous +
// shape assertions.
static __global__ void conv_transpose_1d_kernel(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst) {
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
if (global_index >= output_size) {
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * __restrict__ src0, const float * __restrict__ src1, float * __restrict__ dst) {

const int ol = blockIdx.x;
const int oc = blockIdx.y;
if (ol >= dst_ne0 || oc >= dst_ne1) {
return;
}

int out_index = global_index / dst_ne0;

float accumulator = 0;

for (int c = 0; c < src0_ne2; c++) {
int idx = global_index % dst_ne0;

int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
int input_offset = src1_ne0 * c;

for (int i = 0; i < src1_ne0; i++) {
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
continue;
}
int weight_idx = idx - i*s0;
const int K = src0_ne0;
const int OC = dst_ne1;
const int IC = src0_ne2;
const int IL = src1_ne0;

// Range of input positions i that contribute to this output pixel.
int i_start = (ol - K + 1 + s0 - 1) / s0; // ceil((ol - K + 1) / s0)
if (i_start < 0) i_start = 0;
int i_end = ol / s0;
if (i_end > IL - 1) i_end = IL - 1;

const int tid = threadIdx.x;
const int nth = blockDim.x;

float v = 0.0f;

// Each thread handles a strided slice of IC; the range of i is
// already narrow (≤ K/s0 + 1), so the inner loop is the cheap one.
for (int ic = tid; ic < IC; ic += nth) {
const int kernel_base = (ic * OC + oc) * K;
const int input_base = ic * IL;
#pragma unroll 4
for (int i = i_start; i <= i_end; ++i) {
const int ki = ol - i * s0;
v += src0[kernel_base + ki] * src1[input_base + i];
}
}

float kernel_weight = src0[kernel_offset + weight_idx];
float input_value = src1[input_offset+i];
// Reduce across the warp.
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) {
v += __shfl_xor_sync(0xFFFFFFFFu, v, offset);
}

accumulator += kernel_weight * input_value;
}
if (tid == 0) {
dst[oc * dst_ne0 + ol] = v;
}
dst[global_index] = accumulator;
GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2);

GGML_UNUSED_VARS(p0, d0, output_size,
src0_ne1, src0_ne3, src1_ne1, src1_ne2, src1_ne3,
dst_ne2, dst_ne3);
}

static void conv_transpose_1d_f32_f32_cuda(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst,
cudaStream_t stream) {

const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
s0,p0,d0,output_size,
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
src0,src1, dst);
// Block = one warp (32 threads). Grid has one block per output pixel,
// i.e. (OL, OC). ne2/ne3 are required to be 1 by the existing host-side
// assertions, so we don't extend the grid into z.
const dim3 block_dim(CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 1, 1);
const dim3 grid_dim((unsigned)dst_ne0, (unsigned)dst_ne1, 1);

conv_transpose_1d_kernel<<<grid_dim, block_dim, 0, stream>>>(
s0, p0, d0, output_size,
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
src0, src1, dst);
}

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
Expand Down
3 changes: 2 additions & 1 deletion src/ggml-cuda/conv-transpose-1d.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common.cuh"

#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
// One warp per output pixel; see conv-transpose-1d.cu for why.
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 32

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
Loading