ggml-org · 0cc4m · Jun 8, 2026 · May 30, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -622,6 +622,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
 
 // cuda buffer
 
+static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev);
+static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev);
+
 struct ggml_backend_cuda_buffer_context {
     int device;
     void * dev_ptr = nullptr;
@@ -639,6 +642,9 @@ struct ggml_backend_cuda_buffer_context {
 
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_backend_cuda_device_dec_active(buffer->buft->device);
+
     delete ctx;
 }
 
@@ -791,6 +797,8 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
 
+    ggml_backend_cuda_device_inc_active(buft->device);
+
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
 
@@ -1490,6 +1498,8 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 }
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_cuda_device_dec_active(buffer->buft->device);
+
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
@@ -1498,6 +1508,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
         return nullptr;
     }
 
+    ggml_cuda_set_device(0);
+
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
@@ -1523,6 +1535,8 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->buft = buft;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
+    ggml_backend_cuda_device_inc_active(buft->device);
+
     return buffer;
 }
 
@@ -3137,12 +3151,8 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
     return cuda_ctx->name.c_str();
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+static void ggml_backend_cuda_free(ggml_backend_t backend);
 
-    delete cuda_ctx;
-    delete backend;
-}
 
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
@@ -4877,13 +4887,40 @@ struct ggml_backend_cuda_device_context {
     std::string description;
     std::string pci_bus_id;
     int op_offload_min_batch_size;
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    std::atomic<int> active_count{0};
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 };
 
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
+    ctx->active_count.fetch_add(1, std::memory_order_relaxed);
+}
+
+static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
+    ctx->active_count.fetch_sub(1, std::memory_order_relaxed);
+}
+#else
+static void ggml_backend_cuda_device_inc_active(ggml_backend_dev_t dev) { GGML_UNUSED(dev); }
+static void ggml_backend_cuda_device_dec_active(ggml_backend_dev_t dev) { GGML_UNUSED(dev); }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     return ctx->name.c_str();
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_backend_cuda_device_dec_active(backend->device);
+
+    delete cuda_ctx;
+    delete backend;
+}
+
 static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     return ctx->description.c_str();
@@ -4993,6 +5030,13 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
     }
 #endif // defined(__linux__)
 
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
+    // context that permanently consumes VRAM. Reset the device to free it.
+    if (ctx->active_count.load(std::memory_order_relaxed) == 0) {
+        cudaDeviceReset();
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
@@ -5687,13 +5731,17 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
         return nullptr;
     }
 
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
+
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .guid    = */ ggml_backend_cuda_guid(),
         /* .iface   = */ ggml_backend_cuda_interface,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
+        /* .device  = */ dev,
         /* .context = */ ctx,
     };
 
+    ggml_backend_cuda_device_inc_active(dev);
+
     return cuda_backend;
 }