Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 54 additions & 6 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {

// cuda buffer

static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev);
static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev);

struct ggml_backend_cuda_buffer_context {
int device;
void * dev_ptr = nullptr;
Expand All @@ -639,6 +642,9 @@ struct ggml_backend_cuda_buffer_context {

static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

ggml_backend_cuda_device_dec_active(buffer->buft->device);

delete ctx;
}

Expand Down Expand Up @@ -791,6 +797,8 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac

ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

ggml_backend_cuda_device_inc_active(buft->device);

return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
}

Expand Down Expand Up @@ -1490,6 +1498,8 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
}

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_device_dec_active(buffer->buft->device);

CUDA_CHECK(cudaFreeHost(buffer->context));
}

Expand All @@ -1498,6 +1508,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
return nullptr;
}

ggml_cuda_set_device(0);
Comment thread
0cc4m marked this conversation as resolved.
Outdated

void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
Expand All @@ -1523,6 +1535,8 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

ggml_backend_cuda_device_inc_active(buft->device);

return buffer;
}

Expand Down Expand Up @@ -3137,12 +3151,8 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
return cuda_ctx->name.c_str();
}

static void ggml_backend_cuda_free(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
static void ggml_backend_cuda_free(ggml_backend_t backend);

delete cuda_ctx;
delete backend;
}

static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
Expand Down Expand Up @@ -4877,13 +4887,40 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
int op_offload_min_batch_size;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
std::atomic<int> active_count{0};
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
};

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
ctx->active_count.fetch_add(1, std::memory_order_relaxed);
}

static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
ctx->active_count.fetch_sub(1, std::memory_order_relaxed);
}
#else
static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev) { GGML_UNUSED(dev); }
static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev) { GGML_UNUSED(dev); }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->name.c_str();
}

static void ggml_backend_cuda_free(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml_backend_cuda_device_dec_active(backend->device);

delete cuda_ctx;
delete backend;
}

static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->description.c_str();
Expand Down Expand Up @@ -4993,6 +5030,13 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
}
#endif // defined(__linux__)

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
// context that permanently consumes VRAM. Reset the device to free it.
if (ctx->active_count.load(std::memory_order_relaxed) == 0) {
cudaDeviceReset();
}
Comment thread
0cc4m marked this conversation as resolved.
Outdated
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}

static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
Expand Down Expand Up @@ -5687,13 +5731,17 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return nullptr;
}

ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);

ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(),
/* .iface = */ ggml_backend_cuda_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
/* .device = */ dev,
/* .context = */ ctx,
};

ggml_backend_cuda_device_inc_active(dev);

return cuda_backend;
}

Expand Down
Loading