@@ -1629,6 +1629,11 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    // const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
+
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];
 
@@ -1763,7 +1768,10 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+                quantize_src1(
+                    dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
+                    nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
+                    src1_padded_col_size, ne11, ne12, ne13, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1861,7 +1869,9 @@ static void ggml_cuda_op_mul_mat(
             }
 
             if (quantize_src1 && !src1_is_contiguous) {
-                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
+                quantize_src1(
+                    src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
+                    src1_padded_col_size, src1_ncols, 1, 1, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
 
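
Note on the two call sites above: the reworked quantize_src1 appears to take per-dimension strides of src1 expressed in elements (floats), plus the separate extents ne11/ne12/ne13, instead of the previous fused ne11, ne12*ne13 row count. The following standalone sketch (illustration only, not part of the patch; the tensor shape is hypothetical) shows how the stride values passed at the first call site (nb11/nb12/nb13 divided by sizeof(float)) reduce to the packed values ne10, ne11*ne10, ne12*ne11*ne10 used at the second call site when src1 is contiguous.

// Illustration only: relation between ggml byte strides (nb) and the element
// strides passed to quantize_src1 in the two hunks above.
#include <cassert>
#include <cstdint>

int main() {
    // hypothetical contiguous F32 src1 with extents ne = {4096, 32, 2, 1}
    const int64_t ne[4] = {4096, 32, 2, 1};
    size_t nb[4];
    nb[0] = sizeof(float);
    for (int i = 1; i < 4; ++i) {
        nb[i] = nb[i - 1]*ne[i - 1]; // packed layout
    }

    // element strides as passed in the contiguous-device-copy call site
    const int64_t s11 = nb[1]/sizeof(float);
    const int64_t s12 = nb[2]/sizeof(float);
    const int64_t s13 = nb[3]/sizeof(float);

    // for a contiguous tensor these coincide with the values used at the
    // second call site for the packed per-slice staging buffer
    assert(s11 == ne[0]);
    assert(s12 == ne[1]*ne[0]);
    assert(s13 == ne[2]*ne[1]*ne[0]);
    return 0;
}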
@@ -2155,7 +2165,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
 
-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
 
@@ -2213,18 +2223,24 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
 
     // } else if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-    } else if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    } else if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
+
+    // } else if (!split && use_mul_mat_vec_q) {
+    //     ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
 
     } else if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
 
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    // } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+    //     dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
+
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec) {
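
For readability, the type/shape part of the changed dispatch above can be paraphrased as the predicate below. This is a sketch derived from the diff, not a function in the file; the helper name is made up and it assumes the usual ggml headers.

// Paraphrase of the updated use_mul_mat_vec predicate: F32 src0 is now accepted
// alongside F16/BF16; separately, the row-count cutoff above changed from
// "< MMV_MAX_ROWS" to "<= MMV_MAX_ROWS".
#include "ggml.h"

static bool mul_mat_vec_candidate(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
    const bool src0_type_ok = src0->type == GGML_TYPE_F32
                           || src0->type == GGML_TYPE_F16
                           || src0->type == GGML_TYPE_BF16;
    return src0_type_ok
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
}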
@@ -2299,6 +2315,15 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
+    // if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
+    //     if (ggml_is_quantized(src0->type)) {
+    //         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
+    //     } else {
+    //         ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
+    //     }
+    //     return;
+    // }
+
     GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
@@ -2335,97 +2360,75 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;
 
-    if (ne12 == 1) {
-        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-            for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                GGML_ASSERT(i02 >= 0 && i02 < n_as);
-
-                const int64_t i11 = id % ne11;
-                const int64_t i12 = iid1;
-
-                const int64_t i1 = id;
-                const int64_t i2 = i12;
-
-                src0_row.data = src0_original + i02*nb02;
-                src1_row.data = src1_original + i11*nb11 + i12*nb12;
-                dst_row.data = dst_original + i1*nb1 + i2*nb2;
-
-                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
-            }
-        }
-    } else {
-        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-
-        src1_row.data = src1_contiguous.get();
-        dst_row.data = dst_contiguous.get();
-
-        for (int64_t i02 = 0; i02 < n_as; i02++) {
-            int64_t num_src1_rows = 0;
-
-            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-                for (int64_t id = 0; id < n_ids; id++) {
-                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
-
-                    if (row_id_i != i02) {
-                        continue;
-                    }
-
-                    num_src1_rows++;
-                }
-            }
-
-            if (num_src1_rows == 0) {
-                continue;
-            }
-
-            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
-
-            {
-                dim3 block_dims(std::min((unsigned int)ne10, 768u));
-                dim3 grid_dims(ids->ne[1], n_ids);
-                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    src1_original, src1_contiguous.get(),
-                    dev_cur_src1_row.get(), dev_row_mapping.get(),
-                    ids_dev, i02, ids->nb[1], ids->nb[0],
-                    ne11, ne10,
-                    nb11, nb12);
-                CUDA_CHECK(cudaGetLastError());
-            }
-
-            src0_row.data = src0_original + i02*nb02;
-
-            GGML_ASSERT(nb11 == sizeof(float)*ne10);
-            GGML_ASSERT(nb1 == sizeof(float)*ne0);
-
-            src1_row.ne[1] = num_src1_rows;
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.ne[1] = num_src1_rows;
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
-
-            {
-                dim3 block_dims(std::min((unsigned int)ne0, 768u));
-                dim3 grid_dims(num_src1_rows);
-                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    dst_original, dst_contiguous.get(),
-                    dev_row_mapping.get(),
-                    ne0,
-                    nb1, nb2);
-                CUDA_CHECK(cudaGetLastError());
-            }
+    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+
+    src1_row.data = src1_contiguous.get();
+    dst_row.data = dst_contiguous.get();
+
+    for (int64_t i02 = 0; i02 < n_as; i02++) {
+        int64_t num_src1_rows = 0;
+
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+                if (row_id_i != i02) {
+                    continue;
+                }
+
+                num_src1_rows++;
+            }
+        }
+
+        if (num_src1_rows == 0) {
+            continue;
+        }
+
+        ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+        CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+
+        {
+            dim3 block_dims(std::min((unsigned int)ne10, 768u));
+            dim3 grid_dims(ids->ne[1], n_ids);
+            k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                src1_original, src1_contiguous.get(),
+                dev_cur_src1_row.get(), dev_row_mapping.get(),
+                ids_dev, i02, ids->nb[1], ids->nb[0],
+                ne11, ne10,
+                nb11, nb12);
+            CUDA_CHECK(cudaGetLastError());
+        }
+
+        src0_row.data = src0_original + i02*nb02;
+
+        GGML_ASSERT(nb11 == sizeof(float)*ne10);
+        GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+        src1_row.ne[1] = num_src1_rows;
+        src1_row.nb[1] = nb11;
+        src1_row.nb[2] = num_src1_rows*nb11;
+        src1_row.nb[3] = num_src1_rows*nb11;
+
+        dst_row.ne[1] = num_src1_rows;
+        dst_row.nb[1] = nb1;
+        dst_row.nb[2] = num_src1_rows*nb1;
+        dst_row.nb[3] = num_src1_rows*nb1;
+
+        ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+        {
+            dim3 block_dims(std::min((unsigned int)ne0, 768u));
+            dim3 grid_dims(num_src1_rows);
+            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                dst_original, dst_contiguous.get(),
+                dev_row_mapping.get(),
+                ne0,
+                nb1, nb2);
+            CUDA_CHECK(cudaGetLastError());
         }
     }
 }
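
With the ne12 == 1 special case removed above, every expert now goes through the same gather/compute/scatter path. As a rough host-side illustration of the per-expert gather that k_copy_src1_to_contiguous performs on the GPU (this sketch is not the CUDA kernel; the struct field names and the helper are assumptions for illustration):

// Host-side sketch: copy every src1 row routed to expert i02 into a contiguous
// buffer and record where it came from, so the matching
// k_copy_dst_from_contiguous step can scatter results back afterwards.
#include <cstdint>
#include <cstring>
#include <vector>

struct row_mapping { int32_t i1; int32_t i2; }; // assumed layout of mmid_row_mapping

static int64_t gather_rows_for_expert(
        const float * src1,       // [ne10 x ne11 x n_tokens], contiguous F32
        int64_t ne10, int64_t ne11, int64_t n_tokens,
        const int32_t * ids,      // [n_ids x n_tokens] expert ids per token
        int64_t n_ids, int32_t i02,
        float * contiguous, std::vector<row_mapping> & mapping) {
    int64_t num_src1_rows = 0;
    for (int64_t iid1 = 0; iid1 < n_tokens; ++iid1) {
        for (int64_t id = 0; id < n_ids; ++id) {
            if (ids[iid1*n_ids + id] != i02) {
                continue; // this (token, slot) pair is routed to another expert
            }
            const int64_t i11 = id % ne11; // same row selection as the removed CPU loop
            const float * src_row = src1 + (iid1*ne11 + i11)*ne10;
            std::memcpy(contiguous + num_src1_rows*ne10, src_row, ne10*sizeof(float));
            mapping.push_back({(int32_t) id, (int32_t) iid1});
            ++num_src1_rows;
        }
    }
    return num_src1_rows; // becomes src1_row.ne[1] for this expert's matmul
}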
@@ -2841,7 +2844,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_MUL_MAT_ID) {
+        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3650,9 +3653,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            const size_t ts = ggml_type_size(op->src[0]->type);
-            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
-            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
+            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
         case GGML_OP_CONV_2D_DW: