@@ -6739,6 +6739,39 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 #define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif
 
+template <typename T>
+struct cuda_pool_alloc {
+    T * ptr = nullptr;
+    size_t act_size = 0;
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->act_size);
+        return ptr;
+    }
+
+    cuda_pool_alloc(size_t size) {
+        alloc(size);
+    }
+
+    ~cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            ggml_cuda_pool_free(ptr, act_size);
+        }
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    cuda_pool_alloc() = default;
+    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
+    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+};
+
 static bool g_cublas_loaded = false;
 
 bool ggml_cublas_loaded(void) {
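
The new `cuda_pool_alloc` is a scoped (RAII) owner for pool memory: `alloc()` takes a count of `T` elements and multiplies by `sizeof(T)` internally, the destructor returns the buffer to the pool on every exit path, and the deleted copy/move operations rule out double frees. A minimal sketch of the before/after pattern this commit applies throughout the file; the stubs and function names here are illustrative only, not part of ggml:

    #include <cassert>
    #include <cstdlib>

    #define GGML_ASSERT(x) assert(x)

    // host-side stand-ins for the device pool, so the sketch compiles on its own
    static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
        *actual_size = size;
        return malloc(size);
    }
    static void ggml_cuda_pool_free(void * ptr, size_t /*size*/) {
        free(ptr);
    }

    // paste cuda_pool_alloc<T> from the hunk above here

    static void before(size_t n) {            // old pattern
        size_t as = 0;
        float * buf = (float *) ggml_cuda_pool_malloc(n * sizeof(float), &as);
        if (n == 0) {
            return;                           // leaks buf on this early return
        }
        ggml_cuda_pool_free(buf, as);         // every path must remember this
    }

    static void after(size_t n) {             // new pattern
        cuda_pool_alloc<float> buf(n);        // size in elements, not bytes
        if (n == 0) {
            return;                           // fine: the destructor frees buf
        }
        // ... use buf.get() ...
    }                                         // freed here as well
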
@@ -7432,16 +7465,16 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
+    cuda_pool_alloc<half> src1_dfloat_a;
+    half * src1_dfloat = nullptr; // dfloat == half
 
     bool src1_convert_f16 =
         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
         ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
                                ne00, 1, sizeof(float), 0, 0,
                                ne00, 1, sizeof(half),  0, 0, stream);
@@ -7489,12 +7522,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
             break;
     }
 
-#ifdef GGML_CUDA_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
-#endif // GGML_CUDA_F16
-
     (void) src1;
     (void) dst;
     (void) src1_ddq_i;
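
With the holder in place, the `#ifdef GGML_CUDA_F16` cleanup block removed above becomes redundant: `src1_dfloat_a` only acquires pool memory when `src1_convert_f16` is true, and a holder whose `alloc()` never ran destructs as a no-op because `ptr` is still `nullptr`. The conditional-allocation idiom, as a sketch (the function and parameter names are illustrative; `half` comes from `cuda_fp16.h`):

    #include <cuda_fp16.h>

    static void convert_maybe(bool need_f16_copy, size_t ne00) {
        cuda_pool_alloc<half> tmp;        // no pool memory yet: ptr == nullptr
        half * src1_h = nullptr;
        if (need_f16_copy) {              // plays the role of src1_convert_f16
            src1_h = tmp.alloc(ne00);     // pool memory for ne00 half elements
        }
        // ... launch kernels that read src1_h only when it is non-null ...
    }                                     // ~cuda_pool_alloc frees only if alloc() ran
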
@@ -7529,29 +7556,26 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        half * src0_as_f16 = nullptr;
-        size_t src0_as = 0;
+        cuda_pool_alloc<half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
-            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+            src0_as_f16.alloc(ne);
+            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
         }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 
-        half * src1_as_f16 = nullptr;
-        size_t src1_as = 0;
+        cuda_pool_alloc<half> src1_as_f16;
         if (src1->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+            src1_as_f16.alloc(ne);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
         }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
-        size_t dst_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
+        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
 
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
@@ -7560,36 +7584,25 @@ inline void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
-                                src1_ptr, CUDA_R_16F, ne10,
-                    &beta_f16,   dst_f16, CUDA_R_16F, ldc,
+                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
+                                src1_ptr,       CUDA_R_16F, ne10,
+                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
                     CUBLAS_COMPUTE_16F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
-
-        if (src1_as != 0) {
-            ggml_cuda_pool_free(src1_as_f16, src1_as);
-        }
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
+        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
     }
     else {
-        float * src0_ddq_as_f32 = nullptr;
-        size_t src0_as = 0;
+        cuda_pool_alloc<float> src0_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
             GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
 
         const float alpha = 1.0f;
         const float beta = 0.0f;
@@ -7601,10 +7614,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
                     &alpha, src0_ddf_i, ne00,
                             src1_ddf_i, ne10,
                     &beta,  dst_dd_i,   ldc));
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
-        }
     }
 
     (void) dst;
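
Note the sizing convention in the hunk above: `alloc()` and the sizing constructor take element counts, so the explicit `* sizeof(half)` from the old `ggml_cuda_pool_malloc` calls is folded into the wrapper. The `dst_f16` buffer, for instance, requests the same number of bytes either way:

    // new: an element count; the wrapper requests row_diff*src1_ncols*sizeof(half) bytes
    cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);

    // old, equivalent request in bytes:
    // size_t dst_as = 0;
    // half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
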
@@ -7896,33 +7905,33 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     float * src1_ddf = nullptr;
     float *  dst_ddf = nullptr;
 
-    // as = actual size
-    size_t src0_asf = 0;
-    size_t src1_asf = 0;
-    size_t  dst_asf = 0;
+    cuda_pool_alloc<float> src0_f;
+    cuda_pool_alloc<float> src1_f;
+    cuda_pool_alloc<float>  dst_f;
 
     ggml_cuda_set_device(g_main_device);
-    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        src0_ddf = src0_f.alloc(ggml_nelements(src0));
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
     if (use_src1) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            src1_ddf = src1_f.alloc(ggml_nelements(src1));
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+        dst_f.alloc(ggml_nelements(dst));
+        dst_ddf = dst_f.get();
     }
 
     // do the computation
@@ -7934,16 +7943,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-    if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
-    }
-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
-    }
-
     if (dst->backend == GGML_BACKEND_CPU) {
         CUDA_CHECK(cudaDeviceSynchronize());
     }
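
The switch from `ggml_nbytes` to `ggml_nelements` follows from the same convention: these buffers are `cuda_pool_alloc<float>`, so a byte count would be multiplied by `sizeof(float)` a second time and over-allocate fourfold. A worked check, assuming an F32 tensor `t`:

    static void size_check(const ggml_tensor * t) {  // t assumed to be F32
        cuda_pool_alloc<float> buf;
        buf.alloc(ggml_nelements(t));   // ne elements -> ne * 4 bytes, as intended
        // buf.alloc(ggml_nbytes(t));   // ne*4 "elements" -> ne * 16 bytes, 4x too much
    }
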
@@ -8516,14 +8515,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
     GGML_ASSERT(to_fp16_cuda != nullptr);
 
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+    cuda_pool_alloc<half> src1_as_f16(ne1);
+    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
 
-    size_t dst_as = 0;
-
-    half * dst_f16 = nullptr;
-    char * dst_t   = nullptr;
+    cuda_pool_alloc<half> dst_f16;
+    char * dst_t;
 
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t      cu_data_type    = CUDA_R_16F;
@@ -8542,8 +8538,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
85428538    const  void  * beta  = &beta_f16;
85438539
85448540    if  (dst->op_params [0 ] == GGML_PREC_DEFAULT) {
8545-         dst_f16 = (half *) ggml_cuda_pool_malloc (ne * sizeof (half), &dst_as);
8546-         dst_t    = (char  *) dst_f16;
8541+         dst_t  = (char  *) dst_f16.alloc (ne);
85478542
85488543        nbd2 /= sizeof (float ) / sizeof (half);
85498544        nbd3 /= sizeof (float ) / sizeof (half);
@@ -8590,29 +8585,23 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
-                       (const char *) src1_as_f16, CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                beta,  (      char *)       dst_t, cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
+                alpha, (const char *) src0_as_f16,       CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                       (const char *) src1_as_f16.get(), CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta,  (      char *)       dst_t,       cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
         const int ne23 = ne12*ne13;
 
-        const void ** ptrs_src = nullptr;
-              void ** ptrs_dst = nullptr;
-
-        size_t ptrs_src_s = 0;
-        size_t ptrs_dst_s = 0;
-
-        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
+        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_t,
-                ptrs_src, ptrs_dst,
+                src0_as_f16, src1_as_f16.get(), dst_t,
+                ptrs_src.get(), ptrs_dst.get(),
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
@@ -8624,30 +8613,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
-                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
-                beta,  (      void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-        }
-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-        }
     }
 #endif
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
     }
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
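
In the `cublasGemmBatchedEx` branch, the two holders own device-side pointer tables rather than matrix data: `k_compute_batched_ptrs` fills them on the GPU, and the GEMM then reads the A pointers at offset `0*ne23`, the B pointers at `1*ne23`, and the outputs through `ptrs_dst`. Because both holders are declared inside the `else` block, the tables return to the pool when the branch ends, which is what lets the trailing `if (..._s != 0)` frees disappear. The layout, as a sketch:

    const int ne23 = ne12*ne13;                     // matrices in the batch

    cuda_pool_alloc<const void *> ptrs_src(2*ne23); // [0*ne23, 1*ne23): A (src0, f16)
                                                    // [1*ne23, 2*ne23): B (src1)
    cuda_pool_alloc<      void *> ptrs_dst(1*ne23); // [0*ne23, 1*ne23): C (dst)

    // filled on-device by k_compute_batched_ptrs, then consumed as:
    //   ptrs_src.get() + 0*ne23, ptrs_src.get() + 1*ne23, ptrs_dst.get() + 0*ne23
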
@@ -8974,12 +8952,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
             ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
         }
     } else {
-        size_t as_src1, as_dst;
-        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
-        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
+        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
 
-        src1_row_extra.data_device[g_main_device] = src1_contiguous;
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
 
         const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
             cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
@@ -8999,7 +8976,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
                                         nb11, src1_kind, stream));
                 num_src1_rows++;
             }
@@ -9031,14 +9008,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
                                         nb1, dst_kind, stream));
                 num_src1_rows++;
             }
         }
-
-        ggml_cuda_pool_free(dst_contiguous,  as_dst);
-        ggml_cuda_pool_free(src1_contiguous, as_src1);
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
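
In `ggml_cuda_mul_mat_id` the holders are `cuda_pool_alloc<char>`, so element count and byte count coincide and the original `sizeof(float)*ggml_nelements(...)` sizing carries over unchanged; only the raw pointers in the `cudaMemcpyAsync` arithmetic become `.get()` calls. A sketch of the unchanged arithmetic:

    // T = char: one element per byte, so byte-valued sizes are passed as-is
    cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));

    // pointer arithmetic is the same as before, via .get():
    // cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11,
    //                 src1_original + i01*nb11, nb11, src1_kind, stream);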