6868#include " ggml-cuda.h"
6969#include " ggml.h"
7070
71+ #if defined(_MSC_VER)
72+ #pragma warning(disable: 4244 4267) // possible loss of data
73+ #endif
74+
7175static_assert (sizeof (half) == sizeof (ggml_fp16_t ), " wrong fp16 size" );
7276
7377#define CUDA_CHECK (err ) \
@@ -1518,19 +1522,13 @@ static void * g_scratch_buffer = nullptr;
15181522static size_t g_scratch_size = 1024 *1024 *1024 ; // 1 GB by default
15191523static size_t g_scratch_offset = 0 ;
15201524
1521- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1522- #define GGML_CUDA_MAX_EVENTS 64
1523-
15241525static int g_device_count = -1 ;
15251526static int g_main_device = 0 ;
15261527static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0 };
15271528
15281529static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr };
15291530
1530- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1531-
1532- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1533- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1531+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
15341532
15351533void ggml_init_cublas () {
15361534 static bool initialized = false ;
@@ -1554,15 +1552,8 @@ void ggml_init_cublas() {
15541552 for (int id = 0 ; id < g_device_count; ++id) {
15551553 CUDA_CHECK (cudaSetDevice (id));
15561554
1557- // create streams
1558- for (int i = 0 ; i < GGML_CUDA_MAX_STREAMS; ++i) {
1559- CUDA_CHECK (cudaStreamCreateWithFlags (&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1560- CUDA_CHECK (cudaStreamCreateWithFlags (&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1561- }
1562- // create events
1563- for (int i = 0 ; i < GGML_CUDA_MAX_EVENTS; ++i) {
1564- CUDA_CHECK (cudaEventCreateWithFlags (&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1565- }
1555+ // create main stream
1556+ CUDA_CHECK (cudaStreamCreateWithFlags (&g_cudaStreams_main[id], cudaStreamNonBlocking));
15661557
15671558 // create cublas handle
15681559 CUBLAS_CHECK (cublasCreate (&g_cublas_handles[id]));
@@ -2029,6 +2020,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
20292020 size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0 };
20302021 size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0 };
20312022
2023+ // if multiple GPUs are used they need to wait for the main GPU to finish
2024+ if (split && g_device_count > 1 ) {
2025+ CUDA_CHECK (cudaSetDevice (g_main_device));
2026+ CUDA_CHECK (cudaDeviceSynchronize ());
2027+ }
2028+
20322029 for (int id = 0 ; id < g_device_count; ++id) {
20332030 if (!split && id != g_main_device) {
20342031 continue ;
@@ -2127,9 +2124,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
21272124 }
21282125 const int64_t i11 = i13*ne12 + i12;
21292126
2130- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
2131- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
2132- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2127+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
21332128
21342129 // for split tensors the data begins at i0 == i0_offset_low
21352130 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2157,14 +2152,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
21572152 if (src1->backend == GGML_BACKEND_CPU) {
21582153 GGML_ASSERT (!flatten_rows || nrows0 == ggml_nrows (src1));
21592154 int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2160- CUDA_CHECK (ggml_cuda_cpy_tensor_2d (src1_ddf_i, src1, i03, i02, 0 , nrows1, cudaStream_memcpy_src1 ));
2155+ CUDA_CHECK (ggml_cuda_cpy_tensor_2d (src1_ddf_i, src1, i03, i02, 0 , nrows1, cudaStream_main ));
21612156 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
21622157 if (id != g_main_device) {
21632158 GGML_ASSERT (!flatten_rows);
21642159 float * src1_ddf_i_source = (float *) src1_extra->data_device [g_main_device];
21652160 src1_ddf_i_source += i11*src1_stride;
21662161 CUDA_CHECK (cudaMemcpyAsync (src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof (float ),
2167- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1 ));
2162+ cudaMemcpyDeviceToDevice, cudaStream_main ));
21682163 }
21692164 } else if (src1_on_device && !src1_is_contiguous) {
21702165 GGML_ASSERT (!split);
@@ -2173,7 +2168,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
21732168 GGML_ASSERT (false );
21742169 }
21752170 }
2176- CUDA_CHECK (cudaEventRecord (cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
21772171
21782172 if (!src0_on_device || !src0_is_contiguous) {
21792173 if (src0_is_f32) {
@@ -2189,9 +2183,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
21892183 CUDA_CHECK (cudaGetLastError ());
21902184 }
21912185
2192- // wait with main stream until src1 memcpy is done
2193- CUDA_CHECK (cudaStreamWaitEvent (cudaStream_main, cudaEvent_memcpy_src1, 0 ));
2194-
21952186 // do the computation
21962187 op (src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
21972188
@@ -2229,8 +2220,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
22292220
22302221 // wait until each device is finished, then free their buffers
22312222 for (int id = 0 ; id < g_device_count; ++id) {
2223+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0 ) {
2224+ continue ;
2225+ }
2226+
22322227 CUDA_CHECK (cudaSetDevice (id));
22332228 CUDA_CHECK (cudaDeviceSynchronize ());
2229+
22342230 if (src0_asq[id] > 0 ) {
22352231 ggml_cuda_pool_free (src0_ddq[id], src0_asq[id]);
22362232 }
@@ -2296,7 +2292,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
22962292 const int64_t ne02 = src0->ne [2 ];
22972293
22982294 CUDA_CHECK (cudaSetDevice (g_main_device));
2299- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][ 0 ] ;
2295+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
23002296
23012297 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
23022298 void * src0_ddq = src0_extra->data_device [g_main_device];
@@ -2308,8 +2304,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
23082304 float * dst_ddf = (float *) dst_extra->data_device [g_main_device];
23092305
23102306 ggml_mul_mat_p021_f16_f32_cuda (src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2311-
2312- CUDA_CHECK (cudaDeviceSynchronize ());
23132307}
23142308
23152309void ggml_cuda_mul_mat_vec_nc (const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2327,7 +2321,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
23272321 const int64_t nb02 = src0->nb [2 ];
23282322
23292323 CUDA_CHECK (cudaSetDevice (g_main_device));
2330- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][ 0 ] ;
2324+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
23312325
23322326 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
23332327 void * src0_ddq = src0_extra->data_device [g_main_device];
@@ -2342,8 +2336,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
23422336 const int channel_stride_x = nb02 / sizeof (half);
23432337
23442338 ggml_mul_mat_vec_nc_f16_f32_cuda (src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2345-
2346- CUDA_CHECK (cudaDeviceSynchronize ());
23472339}
23482340
23492341void ggml_cuda_mul_mat (const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2399,7 +2391,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
23992391 const int64_t nb12 = src1->nb [2 ];
24002392
24012393 CUDA_CHECK (cudaSetDevice (g_main_device));
2402- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][ 0 ] ;
2394+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
24032395
24042396 const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
24052397 const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra ;
@@ -2417,8 +2409,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
24172409 GGML_ASSERT (false );
24182410 }
24192411
2420- CUDA_CHECK (cudaDeviceSynchronize ());
2421-
24222412 (void ) dst;
24232413}
24242414
0 commit comments