
Commit eb87f3e

Reapply commit "CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (ggml-org#13014)" minus MMVQ
1 parent 004d370 commit eb87f3e

File tree

6 files changed: +246 −192 lines changed


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 88 additions & 87 deletions
@@ -1629,6 +1629,11 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    // const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
+
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];
 
@@ -1763,7 +1768,10 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+                quantize_src1(
+                    dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
+                    nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
+                    src1_padded_col_size, ne11, ne12, ne13, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1861,7 +1869,9 @@ static void ggml_cuda_op_mul_mat(
             }
 
             if (quantize_src1 && !src1_is_contiguous) {
-                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
+                quantize_src1(
+                    src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
+                    src1_padded_col_size, src1_ncols, 1, 1, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
 
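The two call sites above now pass the quantization kernel explicit element strides instead of a flat row count, so a non-contiguous F32 src1 can be quantized in place. A minimal sketch of how such strides address src1 is shown below; the helper name and the exact meaning of the parameters are assumptions inferred from the call sites, not the actual quantize_src1 implementation.

#include <cstdint>

// Illustrative only: addressing an F32 src1 element through the element strides
// that the new quantize_src1 signature receives (s11 = nb11/sizeof(float), etc.).
static inline const float * src1_elem(
        const float * src1, int64_t i10, int64_t i11, int64_t i12, int64_t i13,
        int64_t s11, int64_t s12, int64_t s13) {
    // For a contiguous tensor these strides collapse to
    // s11 = ne10, s12 = ne11*ne10, s13 = ne12*ne11*ne10,
    // which is exactly what the non-contiguous fallback path above passes.
    return src1 + i10 + i11*s11 + i12*s12 + i13*s13;
}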
@@ -2155,7 +2165,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
 
-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
 
@@ -2213,18 +2223,24 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
 
     // } else if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-    } else if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    } else if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
+
+    // } else if (!split && use_mul_mat_vec_q) {
+    //     ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
 
     } else if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
 
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    // } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+    //     dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
+
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
    } else if (use_mul_mat_vec) {
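The dispatch above now calls ggml_cuda_mul_mat_vec with an extra tensor argument, which is nullptr on the plain GGML_OP_MUL_MAT path; the MMVQ branch stays commented out, matching the "minus MMVQ" note in the commit title. A declaration sketch consistent with these call sites (the parameter name ids is an assumption, not taken from the header) would be:

// Sketch of the signature implied by the call sites above; the extra tensor is
// the expert-id matrix used by MUL_MAT_ID and is nullptr for a regular MUL_MAT.
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx,
        const ggml_tensor * src0, const ggml_tensor * src1,
        const ggml_tensor * ids, ggml_tensor * dst);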
@@ -2299,6 +2315,15 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
+    // if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
+    //     if (ggml_is_quantized(src0->type)) {
+    //         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
+    //     } else {
+    //         ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
+    //     }
+    //     return;
+    // }
+
     GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
@@ -2335,97 +2360,75 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;
 
-    if (ne12 == 1) {
-        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-            for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
 
-                const int64_t i11 = id % ne11;
-                const int64_t i12 = iid1;
+    src1_row.data = src1_contiguous.get();
+    dst_row.data = dst_contiguous.get();
 
-                const int64_t i1 = id;
-                const int64_t i2 = i12;
+    for (int64_t i02 = 0; i02 < n_as; i02++) {
+        int64_t num_src1_rows = 0;
 
-                src0_row.data = src0_original + i02*nb02;
-                src1_row.data = src1_original + i11*nb11 + i12*nb12;
-                dst_row.data = dst_original + i1*nb1 + i2*nb2;
-
-                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
-            }
-        }
-    } else {
-        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-
-        src1_row.data = src1_contiguous.get();
-        dst_row.data = dst_contiguous.get();
-
-        for (int64_t i02 = 0; i02 < n_as; i02++) {
-            int64_t num_src1_rows = 0;
-
-            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-                for (int64_t id = 0; id < n_ids; id++) {
-                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-                    if (row_id_i != i02) {
-                        continue;
-                    }
+                GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
-                    num_src1_rows++;
+                if (row_id_i != i02) {
+                    continue;
                 }
-            }
 
-            if (num_src1_rows == 0) {
-                continue;
+                num_src1_rows++;
             }
+        }
 
-            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+        if (num_src1_rows == 0) {
+            continue;
+        }
 
-            {
-                dim3 block_dims(std::min((unsigned int)ne10, 768u));
-                dim3 grid_dims(ids->ne[1], n_ids);
-                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    src1_original, src1_contiguous.get(),
-                    dev_cur_src1_row.get(), dev_row_mapping.get(),
-                    ids_dev, i02, ids->nb[1], ids->nb[0],
-                    ne11, ne10,
-                    nb11, nb12);
-                CUDA_CHECK(cudaGetLastError());
-            }
+        ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+        CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
 
-            src0_row.data = src0_original + i02*nb02;
+        {
+            dim3 block_dims(std::min((unsigned int)ne10, 768u));
+            dim3 grid_dims(ids->ne[1], n_ids);
+            k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                src1_original, src1_contiguous.get(),
+                dev_cur_src1_row.get(), dev_row_mapping.get(),
+                ids_dev, i02, ids->nb[1], ids->nb[0],
+                ne11, ne10,
+                nb11, nb12);
+            CUDA_CHECK(cudaGetLastError());
+        }
 
-            GGML_ASSERT(nb11 == sizeof(float)*ne10);
-            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+        src0_row.data = src0_original + i02*nb02;
 
-            src1_row.ne[1] = num_src1_rows;
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
+        GGML_ASSERT(nb11 == sizeof(float)*ne10);
+        GGML_ASSERT(nb1 == sizeof(float)*ne0);
 
-            dst_row.ne[1] = num_src1_rows;
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
+        src1_row.ne[1] = num_src1_rows;
+        src1_row.nb[1] = nb11;
+        src1_row.nb[2] = num_src1_rows*nb11;
+        src1_row.nb[3] = num_src1_rows*nb11;
 
-            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+        dst_row.ne[1] = num_src1_rows;
+        dst_row.nb[1] = nb1;
+        dst_row.nb[2] = num_src1_rows*nb1;
+        dst_row.nb[3] = num_src1_rows*nb1;
 
-            {
-                dim3 block_dims(std::min((unsigned int)ne0, 768u));
-                dim3 grid_dims(num_src1_rows);
-                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    dst_original, dst_contiguous.get(),
-                    dev_row_mapping.get(),
-                    ne0,
-                    nb1, nb2);
-                CUDA_CHECK(cudaGetLastError());
-            }
+        ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+        {
+            dim3 block_dims(std::min((unsigned int)ne0, 768u));
+            dim3 grid_dims(num_src1_rows);
+            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                dst_original, dst_contiguous.get(),
+                dev_row_mapping.get(),
+                ne0,
+                nb1, nb2);
+            CUDA_CHECK(cudaGetLastError());
        }
    }
 }
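With the ne12 == 1 special case removed, ggml_cuda_mul_mat_id always packs the src1 rows routed to each expert into a contiguous buffer, runs one ggml_cuda_mul_mat per expert, and scatters the results back through the recorded row mapping; this is what makes the batched bs1 MUL_MAT_ID path from the commit title possible. Below is a CPU-side sketch of the gather that k_copy_src1_to_contiguous performs on the GPU; aside from the mmid_row_mapping name, the field layout, helper name, and contiguous src1/ids layouts are illustrative assumptions.

#include <cstdint>

// Assumed two-field mapping: dst row index within the expert slot and the token index.
struct mmid_row_mapping { int32_t i1; int32_t i2; };

// Collect every src1 row routed to expert i02 into a packed buffer and remember
// where each row came from so the matmul result can be scattered back afterwards.
static int64_t gather_rows_for_expert(
        const float * src1, float * src1_contiguous, mmid_row_mapping * mapping,
        const int32_t * ids, int64_t n_ids, int64_t n_tokens,
        int64_t ne10, int64_t ne11, int64_t i02) {
    int64_t num_src1_rows = 0;
    for (int64_t iid1 = 0; iid1 < n_tokens; iid1++) {
        for (int64_t id = 0; id < n_ids; id++) {
            if (ids[iid1*n_ids + id] != i02) {
                continue; // this slot is routed to a different expert
            }
            const int64_t i11 = id % ne11;
            // copy one row of src1 into the packed buffer
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                src1_contiguous[num_src1_rows*ne10 + i10] = src1[(iid1*ne11 + i11)*ne10 + i10];
            }
            mapping[num_src1_rows] = mmid_row_mapping{(int32_t) id, (int32_t) iid1};
            num_src1_rows++;
        }
    }
    return num_src1_rows; // number of rows in this expert's single matmul
}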
@@ -2841,7 +2844,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_MUL_MAT_ID) {
+        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3650,9 +3653,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            const size_t ts = ggml_type_size(op->src[0]->type);
-            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
-            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
+            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
         case GGML_OP_CONV_2D_DW:
