10 changes: 10 additions & 0 deletions CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_CUBLAS "llama: use CUDA"
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
option(LLAMA_CUDA_USE_CUDA_POOL "llama: use CUDA memory pools instead of the custom memory pool" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -270,6 +271,11 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()

if (LLAMA_CUDA_USE_CUDA_POOL)
add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL)
endif()

add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS)
if (LLAMA_CUDA_FORCE_MMQ)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_USE_CUDA_POOL)
target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL)
endif()

target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
121 changes: 102 additions & 19 deletions ggml-cuda.cu
@@ -108,6 +108,10 @@
#define CUDA_USE_TENSOR_CORES
#endif

#if defined(GGML_USE_CUDA_MEMORY_POOL)
#define CUDA_USE_MEMORY_POOL
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32
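
The g_cudaMemPools array used by the new code is declared elsewhere in ggml-cuda.cu and is not shown in this excerpt; below is a minimal sketch of the assumed declaration, following the file's existing per-device array pattern (the GGML_CUDA_MAX_DEVICES bound is an assumption taken from the surrounding file):

```cpp
// Assumed declaration (not shown in this diff): one memory pool handle per device,
// left as nullptr when the runtime probe in ggml_init_cublas() finds no usable pool.
static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
```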

@@ -5844,8 +5848,48 @@ void ggml_init_cublas() {
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor);

#if defined(CUDA_USE_MEMORY_POOL)
bool support_mem_pool = true;
#if CUDART_VERSION >= 12000
support_mem_pool = (prop.memoryPoolsSupported == 1);
#endif
if (support_mem_pool) {
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t threshold = UINT64_MAX;
err = cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &threshold);
if (err == cudaSuccess) {
fprintf(stderr, ", CUDA memory pool is supported\n");
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (release threshold error)\n");
}
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (can't load default pool)\n");
}
// test alloc/dealloc
if (err == cudaSuccess) {
void *testPtr;
size_t testSize = 1024;
err = cudaMallocFromPoolAsync(&testPtr, testSize, g_cudaMemPools[id], g_cudaStreams[id][0]);
if (err == cudaSuccess) {
err = cudaFreeAsync(testPtr, g_cudaStreams[id][0]);
if (err != cudaSuccess) {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (deallocation failed)\n");
}
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n");
}
}
} else {
fprintf(stderr, ", CUDA memory pool is not supported\n");
}
#endif
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -5854,6 +5898,52 @@ void ggml_init_cublas() {
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

#if defined(CUDA_USE_MEMORY_POOL)
if (g_device_count > 1) {
// give the main device and the other devices access to each other's memory pools
if (g_cudaMemPools[g_main_device] != nullptr) {
cudaMemPool_t main_device_pool;
cudaMemAccessDesc desc_main_device = {};
desc_main_device.location.type = cudaMemLocationTypeDevice;
desc_main_device.location.id = g_main_device;
desc_main_device.flags = cudaMemAccessFlagsProtReadWrite;
CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id == g_main_device) continue;

if (g_cudaMemPools[id] == nullptr) {
fprintf(stderr,
"Warning: Device %d doesnt support CUDA memory pool, skipping pool access config\n",
id);
continue;
}

cudaMemAccessDesc desc_device = {};
desc_device.location.type = cudaMemLocationTypeDevice;
desc_device.location.id = id;
desc_device.flags = cudaMemAccessFlagsProtReadWrite;
cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */);
if (err != cudaSuccess) {
fprintf(stderr, "Can't give access for main device memory pool to device %d\n", id);
}
cudaMemPool_t mempool;
CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id));
err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */);
if (err != cudaSuccess) {
fprintf(stderr, "Can't give access for device %d memory pool to main device \n", id);
}
}
} else {
fprintf(stderr,
"WARNING: Your main GPU device doesnt support CUDA memory pools. Using custom memory pool implementation.\n");
for (int id = 0; id < g_device_count; ++id) {
g_cudaMemPools[id] = nullptr;
}
}
}
#endif

for (int id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram;
}
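
The hunks below replace ggml_cuda_pool_malloc with ggml_cuda_pool_malloc_async, whose definition is also outside this excerpt. A minimal sketch of such a helper, assuming it allocates from the device pool probed above and falls back to the existing custom pool allocator when g_cudaMemPools[device] is nullptr:

```cpp
// Hypothetical sketch, not the PR's exact implementation: stream-ordered allocation
// from the device's CUDA memory pool, with a fallback to the legacy custom pool.
static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int device, cudaStream_t stream) {
    if (g_cudaMemPools[device] == nullptr) {
        // device has no usable CUDA memory pool: reuse the existing allocator
        return ggml_cuda_pool_malloc(size, actual_size);
    }
    void * ptr = nullptr;
    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[device], stream));
    *actual_size = size;
    return ptr;
}
```

With this shape, allocation becomes stream-ordered: the buffer is usable once the stream reaches the enqueue point, rather than at call time.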
@@ -5869,13 +5959,6 @@ void ggml_init_cublas() {
// create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));

// configure memory pool
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t treshold = UINT64_MAX;
CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
}
}

// configure logging to stdout
@@ -6375,7 +6458,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

if (src1_convert_f16) {
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream);
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
ne00, 1, sizeof(float), 0, 0,
ne00, 1, sizeof(half), 0, 0, stream);
@@ -6776,22 +6859,22 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
if (src0_on_device) {
src0_ddf = (float *) src0_extra->data_device[g_main_device];
} else {
src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream);
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
}

if (use_src1 && !src1_stays_on_host) {
if (src1_on_device) {
src1_ddf = (float *) src1_extra->data_device[g_main_device];
} else {
src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream);
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
}
}
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream);
}

// do the computation
Expand All @@ -6803,18 +6886,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}

if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}

if (src0_asf > 0) {
ggml_cuda_pool_free(src0_ddf, src0_asf);
ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream);
}
if (src1_asf > 0) {
ggml_cuda_pool_free(src1_ddf, src1_asf);
ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream);
}
if (dst_asf > 0) {
ggml_cuda_pool_free(dst_ddf, dst_asf);
}

if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream);
}
}

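ggml_cuda_pool_free_async is likewise not shown in this excerpt; a matching sketch under the same assumptions follows. Since cudaFreeAsync is stream-ordered, the cudaDeviceSynchronize() that guards the copy back to a CPU-backend dst still has to complete before host code reads dst->data.

```cpp
// Hypothetical sketch: return the buffer the same way it was allocated, either
// stream-ordered via cudaFreeAsync or through the legacy custom pool.
static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int device, cudaStream_t stream) {
    if (g_cudaMemPools[device] == nullptr) {
        ggml_cuda_pool_free(ptr, actual_size);
        return;
    }
    CUDA_CHECK(cudaFreeAsync(ptr, stream));
}
```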