7 changes: 5 additions & 2 deletions ggml-backend.c
@@ -94,6 +94,7 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
*allocator = (struct ggml_backend_buffer){
/* .interface = */ ggml_allocator_simple_interface,
/* .context = */ ctx,
/* .backend_size = */ 0,
/* .backend_data = */ NULL,
};
return allocator;
@@ -192,6 +193,7 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba

struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
buffer->backend_size = size;
buffer->backend_data = data;

return buffer;
@@ -311,8 +313,9 @@ struct ggml_backend * ggml_backend_cpu_init(void) {
struct ggml_backend * cpu_backend = malloc(sizeof(struct ggml_backend));

*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_interface,
/* .context = */ ctx
/* .interface = */ cpu_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};
return cpu_backend;
}
3 changes: 3 additions & 0 deletions ggml-backend.h
@@ -27,6 +27,7 @@ extern "C" {
struct ggml_backend_buffer {
struct ggml_backend_buffer_interface interface;
ggml_buffer_context_t context;
size_t backend_size;
void * backend_data;
};

@@ -96,6 +97,8 @@ extern "C" {
struct ggml_backend {
struct ggml_backend_interface interface;
ggml_backend_context_t context;

bool is_ram_shared;
};

// backend helper functions
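As a quick illustration of what the new is_ram_shared flag enables (not part of this diff): callers can branch on it to decide whether tensor data needs an explicit upload. A minimal sketch, assuming a host-side source buffer; upload_to_device() and example_set_tensor_data() are hypothetical placeholders for whatever copy path the real backend interface exposes:

#include <string.h>
#include "ggml-backend.h"

static void upload_to_device(void * dst, const void * src, size_t size) {
    // hypothetical placeholder: a real backend would issue a device copy here
    (void) dst; (void) src; (void) size;
}

static void example_set_tensor_data(struct ggml_backend * backend,
                                    void * dst, const void * src, size_t size) {
    if (backend->is_ram_shared) {
        // CPU and Metal share system RAM with the host, so a plain copy is enough
        memcpy(dst, src, size);
    } else {
        // CUDA keeps tensor data in device memory, so it has to go through an upload path
        upload_to_device(dst, src, size);
    }
}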
5 changes: 3 additions & 2 deletions ggml-cuda.cu
@@ -1810,8 +1810,9 @@ ggml_backend * ggml_backend_cuda_init(void) {

ggml_backend * cuda_backend = new ggml_backend;
*cuda_backend = (ggml_backend){
/* .interface = */ cuda_backend_interface,
/* .context = */ ctx
/* .interface = */ cuda_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ false,
};
return cuda_backend;
}
73 changes: 43 additions & 30 deletions ggml-metal.h
@@ -19,51 +19,64 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu);

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
// TODO: temporary - move to backend interface
bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
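Taken together, the two new declarations above imply roughly the following call sequence. This is a sketch based only on the signatures in this header; the buffer name "data", the size arguments, and example_metal_setup() itself are illustrative assumptions:

#include <stddef.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

struct ggml_backend * example_metal_setup(struct ggml_backend * backend_cpu,
                                          void * data, size_t size, size_t max_tensor_size) {
    // create the Metal backend on top of an existing CPU backend
    struct ggml_backend * backend_metal = ggml_backend_metal_init(backend_cpu);
    if (backend_metal == NULL) {
        return NULL;
    }

    // every host buffer used by the graph has to be mapped before computing on it,
    // mirroring the contract of the old ggml_metal_add_buffer
    if (!ggml_backend_metal_map_buffer(backend_metal, "data", data, size, max_tensor_size)) {
        return NULL;
    }

    return backend_metal;
}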
68 changes: 64 additions & 4 deletions ggml-metal.m
@@ -242,12 +242,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
return nil;
}

// TODO: rename to ggml_metal_map_buffer
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size) {
const char * name,
void * data,
size_t size,
size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
fprintf(stderr, "%s: too many buffers\n", __func__);
return false;
@@ -992,3 +993,62 @@ void ggml_metal_graph_compute(
}
}
}

bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size) {
return ggml_metal_add_buffer(backend->context, name, data, size, max_size);
}

static const char * ggml_backend_metal_name(struct ggml_backend * ctx) {
return "Metal";

UNUSED(ctx);
}

static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
ggml_metal_graph_compute(backend->context, cgraph);
}

static struct ggml_backend_interface metal_backend_interface = {
/* .get_name = */ ggml_backend_metal_name,
// the NULL entries below are filled in from the CPU backend in ggml_backend_metal_init
/* .free = */ NULL,
/* .alloc_buffer = */ NULL,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
};

struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
struct ggml_metal_context * ctx = ggml_metal_init(8);

struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend));
*backend_metal = (struct ggml_backend){
/* .interface = */ metal_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};

// reuses CPU calls for now
backend_metal->interface.free = backend_cpu->interface.free;
backend_metal->interface.alloc_buffer = backend_cpu->interface.alloc_buffer;
backend_metal->interface.set_tensor_async = backend_cpu->interface.set_tensor_async;
backend_metal->interface.get_tensor_async = backend_cpu->interface.get_tensor_async;
backend_metal->interface.synchronize = backend_cpu->interface.synchronize;
backend_metal->interface.cpy_tensor_from = backend_cpu->interface.cpy_tensor_from;
backend_metal->interface.cpy_tensor_to = backend_cpu->interface.cpy_tensor_to;
backend_metal->interface.graph_plan_create = backend_cpu->interface.graph_plan_create;
backend_metal->interface.graph_plan_free = backend_cpu->interface.graph_plan_free;
backend_metal->interface.graph_plan_compute = backend_cpu->interface.graph_plan_compute;

return backend_metal;
}
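Because the Metal backend shares RAM with the host and reuses the CPU allocation paths, one plausible consumer of the new backend_size field is mapping a buffer allocated through the generic backend interface into Metal. A hedged sketch, assuming a ggml_backend_buffer obtained from the backend API; example_map_backend_buffer() and the name "buf" are made up for illustration:

#include <stdbool.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

static bool example_map_backend_buffer(struct ggml_backend * backend_metal,
                                       struct ggml_backend_buffer * buffer,
                                       size_t max_tensor_size) {
    // backend_data and backend_size record the raw allocation made by the backend,
    // which is exactly the information the Metal mapping call needs
    return ggml_backend_metal_map_buffer(backend_metal, "buf",
                                         buffer->backend_data,
                                         buffer->backend_size,
                                         max_tensor_size);
}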