@@ -7576,7 +7576,7 @@ static void ggml_cuda_op_mul_mat_cublas(
75767576
75777577    const  int  compute_capability = g_device_caps[id].cc ;
75787578
7579-     if  (compute_capability >= CC_VOLTA && (src0->type  == GGML_TYPE_F16 || ggml_is_quantized (src0->type )) && ggml_is_contiguous (src0) && row_diff == src0->ne [1 ] && dst-> op_params [ 0 ] == GGML_PREC_DEFAULT ) {
7579+     if  (compute_capability >= CC_VOLTA && (src0->type  == GGML_TYPE_F16 || ggml_is_quantized (src0->type )) && ggml_is_contiguous (src0) && row_diff == src0->ne [1 ]) {
75807580        //  convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
75817581        cuda_pool_alloc<half> src0_as_f16;
75827582        if  (src0->type  != GGML_TYPE_F16) {
@@ -7597,23 +7597,44 @@ static void ggml_cuda_op_mul_mat_cublas(
75977597            to_fp16_cuda (src1_ddf_i, src1_as_f16.get (), ne, stream);
75987598        }
75997599        const  half * src1_ptr = src1->type  == GGML_TYPE_F16 ? (const  half *) src1_ddf_i : src1_as_f16.get ();
7600-         cuda_pool_alloc<half> dst_f16 (row_diff*src1_ncols);
76017600
7602-         const half alpha_f16 = 1.0f;
7603-         const half beta_f16 = 0.0f;
7604- 
7605-         CUBLAS_CHECK (cublasSetStream (g_cublas_handles[id], stream));
7606-         CUBLAS_CHECK (
7607-             cublasGemmEx (g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7608-                     row_diff, src1_ncols, ne10,
7609-                     &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
7610-                                 src1_ptr,       CUDA_R_16F, ne10,
7611-                     &beta_f16,   dst_f16.get (), CUDA_R_16F, ldc,
7612-                     CUBLAS_COMPUTE_16F,
7613-                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7614- 
7615-         const  to_fp32_cuda_t  to_fp32_cuda = ggml_get_to_fp32_cuda (GGML_TYPE_F16);
7616-         to_fp32_cuda (dst_f16.get (), dst_dd_i, row_diff*src1_ncols, stream);
7601+         switch  (dst->op_params [0 ]) {
7602+             case  GGML_PREC_DEFAULT:
7603+                 {
7604+                     cuda_pool_alloc<half> dst_f16 (row_diff*src1_ncols);
7605+ 
7606+                     const half alpha_f16 = 1.0f;
7607+                     const half beta_f16  = 0.0f;
7608+ 
7609+                     CUBLAS_CHECK (cublasSetStream (g_cublas_handles[id], stream));
7610+                     CUBLAS_CHECK (
7611+                             cublasGemmEx (g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7612+                                 row_diff,   src1_ncols, ne10,
7613+                                 &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
7614+                                             src1_ptr,      CUDA_R_16F, ne10,
7615+                                 &beta_f16,  dst_f16.get (), CUDA_R_16F, ldc,
7616+                                 CUBLAS_COMPUTE_16F,
7617+                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7618+ 
7619+                     const  to_fp32_cuda_t  to_fp32_cuda = ggml_get_to_fp32_cuda (GGML_TYPE_F16);
7620+                     to_fp32_cuda (dst_f16.get (), dst_dd_i, row_diff*src1_ncols, stream);
7621+                 } break ;
7622+             case  GGML_PREC_F32:
7623+                 {
7624+                     const float alpha_f32 = 1.0f;
7625+                     const float beta_f32  = 0.0f;
7626+ 
7627+                     CUBLAS_CHECK (cublasSetStream (g_cublas_handles[id], stream));
7628+                     CUBLAS_CHECK (
7629+                         cublasGemmEx (g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7630+                                 row_diff, src1_ncols, ne10,
7631+                                 &alpha_f32, src0_ptr, CUDA_R_16F, ne00,
7632+                                             src1_ptr, CUDA_R_16F, ne10,
7633+                                 &beta_f32,  dst_dd_i, CUDA_R_32F, ldc,
7634+                                 CUBLAS_COMPUTE_32F,
7635+                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7636+                 } break ;
7637+         }
76177638    }
76187639    else  {
76197640        cuda_pool_alloc<float > src0_ddq_as_f32;
@@ -9228,6 +9249,20 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
92289249}
92299250
92309251void  ggml_cuda_free_data (struct  ggml_tensor  * tensor) {
9252+     //  print current mem usage using cudaMemGetInfo
9253+     //  TODO: this is a hack - need better solution
9254+     {
9255+         size_t  free ;
9256+         size_t  total;
9257+         CUDA_CHECK (cudaMemGetInfo (&free , &total));
9258+ 
9259+         static  size_t  used = 0 ;
9260+         if  (used < total - free ) {
9261+             printf("CUDA: used %zu MB, free %zu MB\n", (total - free)/1024/1024, free/1024/1024);
9262+             used = total - free ;
9263+         }
9264+     }
9265+ 
92319266    if  (!tensor || !tensor->extra  || (tensor->backend  != GGML_BACKEND_GPU && tensor->backend  != GGML_BACKEND_GPU_SPLIT) ) {
92329267        return ;
92339268    }
0 commit comments