@@ -6564,18 +6564,16 @@ struct scoped_spin_lock {
65646564
65656565static  std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
65666566
6567- #if  0 
6568- #define DEBUG_CUDA_MALLOC
6567+ //  #define DEBUG_CUDA_MALLOC
65696568struct  cuda_buffer  {
65706569    void  * ptr = nullptr ;
65716570    size_t  size = 0 ;
65726571};
65736572
65746573static  cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
6575- 
65766574static  size_t  g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0 };
65776575
6578- static void * ggml_cuda_pool_malloc (size_t size, size_t * actual_size) {
6576+ static  void  * ggml_cuda_pool_malloc_leg (size_t  size, size_t  * actual_size) {
65796577    scoped_spin_lock lock (g_cuda_pool_lock);
65806578    int  id;
65816579    CUDA_CHECK (cudaGetDevice (&id));
@@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
66296627    return  ptr;
66306628}
66316629
6632- static void ggml_cuda_pool_free (void * ptr, size_t size) {
6630+ static  void  ggml_cuda_pool_free_leg (void  * ptr, size_t  size) {
66336631    scoped_spin_lock lock (g_cuda_pool_lock);
66346632    int  id;
66356633    CUDA_CHECK (cudaGetDevice (&id));
@@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
66466644    CUDA_CHECK (cudaFree (ptr));
66476645    g_cuda_pool_size[id] -= size;
66486646}
6649- #else 
66506647
6648+ #if  !defined(GGML_USE_HIPBLAS)
6649+ //  pool with virtual memory
66516650static  std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
66526651static  CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0 };
6653- static  size_t  g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0 };
66546652static  size_t  g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0 };
6655- 
66566653static  const  size_t  CUDA_POOL_MAX_SIZE = 1ull  << 36 ; //  64 GB
66576654
6658- // #define DEBUG_CUDA_MALLOC
6659- 
6660- #define  ggml_cuda_pool_malloc (size, actual_size ) ggml_cuda_pool_malloc_(size, actual_size, #size "  " 
6661- static  void  * ggml_cuda_pool_malloc_ (size_t  size, size_t  * actual_size, const  char  * call) {
6655+ static  void  * ggml_cuda_pool_malloc_vmm (size_t  size, size_t  * actual_size) {
66626656    scoped_spin_lock lock (g_cuda_pool_lock);
66636657    int  id;
66646658    CUDA_CHECK (cudaGetDevice (&id));
@@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
67056699        g_cuda_pool_handles[id].push_back (handle);
67066700        g_cuda_pool_size[id] += reserve_size;
67076701
6708-         printf (" cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s] \n " 
6709-                id, (unsigned  long  long ) (g_cuda_pool_size[id]/1024 /1024 ),
6710-                (unsigned  long  long ) (reserve_size/1024 /1024 ), call );
6702+         // printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
6703+         //        id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
6704+         //        (unsigned long long) (reserve_size/1024/1024));
67116705    }
67126706
67136707    GGML_ASSERT (g_cuda_pool_addr[id] != 0 );
@@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
67176711    g_cuda_pool_used[id] += size;
67186712
67196713#ifdef  DEBUG_CUDA_MALLOC
6720-     printf (" cuda pool[%d]: allocated %llu bytes at %llx [%s]\n " unsigned  long  long ) size, ptr, call );
6714+     printf (" cuda pool[%d]: allocated %llu bytes at %llx [%s]\n " unsigned  long  long ) size, ptr);
67216715#endif 
67226716
67236717    return  ptr;
6724- 
6725-     GGML_UNUSED (call);
67266718}
67276719
6728- #define  ggml_cuda_pool_free (ptr, size ) ggml_cuda_pool_free_(ptr, size, #ptr "  " 
6729- static  void  ggml_cuda_pool_free_ (void  * ptr, size_t  size, const  char  * call) {
6720+ static  void  ggml_cuda_pool_free_vmm (void  * ptr, size_t  size) {
67306721    scoped_spin_lock lock (g_cuda_pool_lock);
67316722    int  id;
67326723    CUDA_CHECK (cudaGetDevice (&id));
67336724
67346725#ifdef  DEBUG_CUDA_MALLOC
6735-     printf (" cuda pool[%d]: free  %llu bytes at %llx [%s] \n " unsigned  long  long ) size, ptr, call );
6726+     printf (" cuda pool[%d]: freed  %llu bytes at %llx\n " unsigned  long  long ) size, ptr);
67366727#endif 
67376728
67386729    g_cuda_pool_used[id] -= size;
67396730
67406731    //  all deallocations must be in reverse order of the allocations
67416732    GGML_ASSERT (ptr == (void  *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
6733+ }
67426734
6743-     GGML_UNUSED (call);
6735+ static  bool  g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false };
6736+ 
6737+ static  void  * ggml_cuda_pool_malloc (size_t  size, size_t  * actual_size) {
6738+     int  id;
6739+     CUDA_CHECK (cudaGetDevice (&id));
6740+     if  (g_device_vmm[id]) {
6741+         return  ggml_cuda_pool_malloc_vmm (size, actual_size);
6742+     } else  {
6743+         return  ggml_cuda_pool_malloc_leg (size, actual_size);
6744+     }
67446745}
67456746
6747+ static  void  ggml_cuda_pool_free (void  * ptr, size_t  size) {
6748+     int  id;
6749+     CUDA_CHECK (cudaGetDevice (&id));
6750+     if  (g_device_vmm[id]) {
6751+         ggml_cuda_pool_free_vmm (ptr, size);
6752+     } else  {
6753+         ggml_cuda_pool_free_leg (ptr, size);
6754+     }
6755+ }
6756+ #else 
6757+ #define  ggml_cuda_pool_malloc  ggml_cuda_pool_malloc_leg
6758+ #define  ggml_cuda_pool_free  ggml_cuda_pool_free_leg
67466759#endif 
67476760
67486761static  bool  g_cublas_loaded = false ;
@@ -6783,9 +6796,18 @@ void ggml_init_cublas() {
67836796#endif 
67846797        fprintf (stderr, " %s: found %d " "  devices:\n " 
67856798        for  (int  id = 0 ; id < g_device_count; ++id) {
6799+             int  deviceSupportsVmm = 0 ;
6800+ 
6801+ #if  !defined(GGML_USE_HIPBLAS)
6802+             CUdevice device;
6803+             CU_CHECK (cuDeviceGet (&device, id));
6804+             CU_CHECK (cuDeviceGetAttribute (&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
6805+             g_device_vmm[id] = !!deviceSupportsVmm;
6806+ #endif 
6807+ 
67866808            cudaDeviceProp prop;
67876809            CUDA_CHECK (cudaGetDeviceProperties (&prop, id));
6788-             fprintf (stderr, "   Device %d: %s, compute capability %d.%d\n " name , prop.major , prop.minor );
6810+             fprintf (stderr, "   Device %d: %s, compute capability %d.%d, VMM: %s \n " name , prop.major , prop.minor , deviceSupportsVmm ?  " yes "  :  " no " 
67896811
67906812            g_tensor_split[id] = total_vram;
67916813            total_vram += prop.totalGlobalMem ;
0 commit comments