Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1554,6 +1554,12 @@ struct ggml_cuda_pdl_config {
};
#endif //defined(GGML_CUDA_USE_PDL)

// PDL and __restrict__ need to be mutually exclusive, see https://github.com/ggml-org/llama.cpp/pull/24030
# if (defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER)
# define GGML_CUDA_RESTRICT
# else
# define GGML_CUDA_RESTRICT __restrict__
# endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER

template<typename Kernel, typename... Args>
static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {
Expand Down
25 changes: 16 additions & 9 deletions ggml/src/ggml-cuda/fattn-common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -678,8 +678,8 @@ static __global__ void flash_attn_mask_to_KV_max(
template<int D, int ncols1, int ncols2> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_stream_k_fixup_uniform(
float * __restrict__ dst,
const float2 * __restrict__ dst_fixup,
float * dst_ptr,
const float2 * dst_fixup_ptr,
Comment on lines +681 to +682

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
float * dst_ptr,
const float2 * dst_fixup_ptr,
float * GGML_CUDA_RESTRICT dst_ptr,
const float2 * GGML_CUDA_RESTRICT dst_fixup_ptr,

I would say to just use the macro in the declaration. Also, if __restrict__ has a negligible impact on Blackwell anyways, please replace all instances of __restrict__ in the CUDA backend with GGML_CUDA_RESTRICT, even if those kernels currently do not use PDL.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sadly does not work in templated function headers which are __global__.
The reason is that __CUDA_ARCH__ is undefined on host compilation passes, but defined on device compilation passes, and this leads to ABI mismatches.

We could do that on some of the function headers (e.g. quantize_q8_1), but it is discouraged by our engineering and possibly/likely unstable.

@ORippler ORippler Jun 3, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could do that on some of the function headers (e.g. quantize_q8_1), but it is discouraged by our engineering and possibly/likely unstable.

This is not supported, quoting from the cuda docs:
https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/cpp-language-extensions.html#cuda-arch

image

@aendk aendk Jun 3, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, if __restrict__ has a negligible impact on Blackwell anyways, please replace all instances of __restrict__ in the CUDA backend with GGML_CUDA_RESTRICT, even if those kernels currently do not use PDL.

Would you still want me to do this by adding code outside of the function header?
I assume that because its not a seamless replacement, it does not make sense at this point.

const int ne01, const int ne02,
const int ne12, const int nblocks_stream_k,
const int gqa_ratio,
Expand All @@ -689,6 +689,8 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
const uint3 fd_iter_j) {
constexpr int ncols = ncols1*ncols2;
ggml_cuda_pdl_lc();
float * GGML_CUDA_RESTRICT dst = dst_ptr;
const float2 * GGML_CUDA_RESTRICT dst_fixup = dst_fixup_ptr;

const int tile_idx = blockIdx.x; // One block per output tile.
const int j = blockIdx.y;
Expand Down Expand Up @@ -760,15 +762,17 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
template <int D, int ncols1, int ncols2> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_stream_k_fixup_general(
float * __restrict__ dst,
const float2 * __restrict__ dst_fixup,
float * dst_ptr,
const float2 * dst_fixup_ptr,
const int ne01, const int ne02,
const int gqa_ratio,
const int total_work,
const uint3 fd_iter_k_j_z_ne12,
const uint3 fd_iter_k_j_z,
const uint3 fd_iter_k_j,
const uint3 fd_iter_k) {
float * GGML_CUDA_RESTRICT dst = dst_ptr;
const float2 * GGML_CUDA_RESTRICT dst_fixup = dst_fixup_ptr;
constexpr int ncols = ncols1*ncols2;

const int bidx0 = blockIdx.x;
Expand Down Expand Up @@ -867,11 +871,14 @@ static __global__ void flash_attn_stream_k_fixup_general(
template<int D> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst,
const float * VKQ_parts_ptr,
const float2 * VKQ_meta_ptr,
float * dst_ptr,
const int parallel_blocks) {
ggml_cuda_pdl_lc();
const float * GGML_CUDA_RESTRICT VKQ_parts = VKQ_parts_ptr;
const float2 * GGML_CUDA_RESTRICT VKQ_meta = VKQ_meta_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
// Dimension 0: threadIdx.x
// Dimension 1: blockIdx.x
// Dimension 2: blockIdx.y
Expand Down Expand Up @@ -1153,8 +1160,8 @@ void launch_fattn(

GGML_ASSERT(block_dim.x % warp_size == 0);

// disabled PDL enrollment for now due to a compiler bug.
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
ggml_cuda_kernel_launch(fattn_kernel, launch_params,
(const char *) Q->data,
K_data,
V_data,
Expand Down
26 changes: 17 additions & 9 deletions ggml/src/ggml-cuda/fattn-mma-f16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1703,14 +1703,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool V_is_K_view>
__launch_bounds__(ggml_cuda_fattn_mma_get_nthreads(DKQ, DV, ncols1*ncols2), ggml_cuda_fattn_mma_get_occupancy(DKQ, DV, ncols1*ncols2))
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const char * Q_ptr,
const char * K_ptr,
const char * V_ptr,
const char * mask_ptr,
const char * sinks_ptr,
const int * KV_max_ptr,
float * dst_ptr,
float2 * dst_meta_ptr,
const float scale,
const float max_bias,
const float m0,
Expand All @@ -1726,6 +1726,14 @@ static __global__ void flash_attn_ext_f16(
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
ggml_cuda_pdl_sync(); // TODO optimize placement
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
const char * GGML_CUDA_RESTRICT K = K_ptr;
const char * GGML_CUDA_RESTRICT V = V_ptr;
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;

// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(DKQ == 128 || DKQ == 256 || DKQ == 512)) {
Expand Down Expand Up @@ -1871,7 +1879,7 @@ static __global__ void flash_attn_ext_f16(
(Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
ne01, ne02, gqa_ratio, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt_gqa, kb0_start, kb0_stop);
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
Expand Down
26 changes: 17 additions & 9 deletions ggml/src/ggml-cuda/fattn-tile.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -788,14 +788,14 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap> // D == head size
__launch_bounds__(ggml_cuda_fattn_tile_get_nthreads(DKQ, DV, ncols1*ncols2), ggml_cuda_fattn_tile_get_occupancy(DKQ, DV, ncols1*ncols2))
static __global__ void flash_attn_tile(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const char * Q_ptr,
const char * K_ptr,
const char * V_ptr,
const char * mask_ptr,
const char * sinks_ptr,
const int * KV_max_ptr,
float * dst_ptr,
float2 * dst_meta_ptr,
const float scale,
const float max_bias,
const float m0,
Expand All @@ -810,6 +810,14 @@ static __global__ void flash_attn_tile(
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
const char * GGML_CUDA_RESTRICT K = K_ptr;
const char * GGML_CUDA_RESTRICT V = V_ptr;
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;

// Skip unused kernel variants for faster compilation:

Expand Down Expand Up @@ -1126,7 +1134,7 @@ static __global__ void flash_attn_tile(
}
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
Expand Down
26 changes: 17 additions & 9 deletions ggml/src/ggml-cuda/fattn-vec.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
static __global__ void flash_attn_ext_vec(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const char * Q_ptr,
const char * K_ptr,
const char * V_ptr,
const char * mask_ptr,
const char * sinks_ptr,
const int * KV_max_ptr,
float * dst_ptr,
float2 * dst_meta_ptr,
const float scale,
const float max_bias,
const float m0,
Expand All @@ -42,6 +42,14 @@ static __global__ void flash_attn_ext_vec(
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
ggml_cuda_pdl_lc();
#ifdef FLASH_ATTN_AVAILABLE
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
const char * GGML_CUDA_RESTRICT K = K_ptr;
const char * GGML_CUDA_RESTRICT V = V_ptr;
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;

// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
Expand Down Expand Up @@ -506,7 +514,7 @@ static __global__ void flash_attn_ext_vec(
dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
Expand Down
26 changes: 17 additions & 9 deletions ggml/src/ggml-cuda/fattn-wmma-f16.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ namespace wmma = rocwmma;
template<int D, int ncols, int nwarps, int VKQ_stride, typename KQ_acc_t, bool use_logit_softcap>
__launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const char * Q_ptr,
const char * K_ptr,
const char * V_ptr,
const char * mask_ptr,
const char * sinks_ptr,
const int * KV_max_ptr,
float * dst_ptr,
float2 * dst_meta_ptr,
const float scale,
const float max_bias,
const float m0,
Expand All @@ -46,6 +46,14 @@ static __global__ void flash_attn_ext_f16(
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
const char * GGML_CUDA_RESTRICT K = K_ptr;
const char * GGML_CUDA_RESTRICT V = V_ptr;
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
Expand Down Expand Up @@ -494,7 +502,7 @@ static __global__ void flash_attn_ext_f16(
dst_meta[j_dst_unrolled] = dst_meta_val;
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-cuda/getrows.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,17 @@ static __global__ void k_get_rows(

template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const src0_t * src0_ptr, const int32_t * src1_ptr, dst_t * dst_ptr,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

ggml_cuda_pdl_lc();
const src0_t * GGML_CUDA_RESTRICT src0 = src0_ptr;
const int32_t * GGML_CUDA_RESTRICT src1 = src1_ptr;
dst_t * GGML_CUDA_RESTRICT dst = dst_ptr;
ggml_cuda_pdl_sync();
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-cuda/mmvf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@

template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false, bool is_multi_token_id = false>
static __global__ void mul_mat_vec_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const T * x_ptr, const float * y_ptr, const int32_t * ids_ptr, const ggml_cuda_mm_fusion_args_device fusion, float * dst_ptr,
const int ncols2, const uint3 nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ids_stride) {
const T * GGML_CUDA_RESTRICT x = x_ptr;
const float * GGML_CUDA_RESTRICT y = y_ptr;
const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
const int row = blockIdx.x;
// for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens)
const int channel_dst = blockIdx.y;
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-cuda/mmvq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -476,12 +476,16 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
template <ggml_type type, int ncols_dst, bool has_fusion, bool small_k = false>
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const void * vx_ptr, const void * vy_ptr, const int32_t * ids_ptr, const ggml_cuda_mm_fusion_args_device fusion, float * dst_ptr,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
const uint32_t ids_stride) {
const void * GGML_CUDA_RESTRICT vx = vx_ptr;
const void * GGML_CUDA_RESTRICT vy = vy_ptr;
const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;

constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-cuda/quantize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
static __global__ void quantize_q8_1(
const float * __restrict__ x, void * __restrict__ vy,
const float * x_ptr, void * vy_ptr,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
ggml_cuda_pdl_lc();
const float * GGML_CUDA_RESTRICT x = x_ptr;
void * GGML_CUDA_RESTRICT vy = vy_ptr;
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

if (i0 >= ne0) {
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-cuda/reduce_rows.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
static __global__ void reduce_rows_f32(const float * x_ptr, float * dst_ptr, const int ncols) {
const float * GGML_CUDA_RESTRICT x = x_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
const int row = blockIdx.x;
const int col = threadIdx.x;

Expand Down
9 changes: 6 additions & 3 deletions ggml/src/ggml-cuda/set-rows.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,9 @@ static void set_rows_cuda_quant(
}

template <typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(const src_t * __restrict__ src0,
const idx_t * __restrict__ src1,
dst_t * __restrict__ dst,
static __global__ void k_set_rows(const src_t * src0_ptr,
const idx_t * src1_ptr,
dst_t * dst_ptr,
const int64_t ne_total,
const int64_t ne10,
const int64_t ne11,
Expand All @@ -133,6 +133,9 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
const uint3 ne02,
const uint3 ne11_fd,
const uint3 ne12_fd) {
const src_t * GGML_CUDA_RESTRICT src0 = src0_ptr;
const idx_t * GGML_CUDA_RESTRICT src1 = src1_ptr;
dst_t * GGML_CUDA_RESTRICT dst = dst_ptr;
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;

if (i >= ne_total) {
Expand Down
10 changes: 7 additions & 3 deletions ggml/src/ggml-cuda/ssm-conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
#include "unary.cuh"

template <bool apply_silu, size_t split_d_inner, size_t d_conv>
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
const float * __restrict__ bias,
static __global__ void ssm_conv_f32(const float * src0_ptr, const float * src1_ptr,
const float * bias_ptr,
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
float * dst_ptr, const int dst_nb0, const int dst_nb1, const int dst_nb2,
const int64_t n_t) {
ggml_cuda_pdl_lc();
const float * GGML_CUDA_RESTRICT src0 = src0_ptr;
const float * GGML_CUDA_RESTRICT src1 = src1_ptr;
const float * GGML_CUDA_RESTRICT bias = bias_ptr;
float * GGML_CUDA_RESTRICT dst = dst_ptr;
GGML_UNUSED(src0_nb0);
const int tid = threadIdx.x;
const int bidx = blockIdx.x;
Expand Down
Loading
Loading