5 changes: 3 additions & 2 deletions ggml-backend.c
@@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) {
ctx->work_size = 0;

struct ggml_backend cpu_backend = {
/* .interface = */ &cpu_backend_interface,
/* .context = */ ctx
/* .interface = */ &cpu_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};
return cpu_backend;
}
12 changes: 12 additions & 0 deletions ggml-backend.h
Expand Up @@ -61,7 +61,10 @@ extern "C" {

struct ggml_backend {
struct ggml_backend_interface * interface;

ggml_backend_context_t context;

bool is_ram_shared;
};

// backend helper functions
@@ -78,6 +81,15 @@ extern "C" {
static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }

// buffer and tensor allocation
// TODO:
// - return "struct ggml_buffer *"
// - fix namings:
// - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc
// - ggml_backend_free_buffer -> ggml_backend_buffer_free
// - ggml_backend_reset_buffer -> ggml_backend_buffer_reset
// - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc
// - ggml_backend_tensor_cpy -> ggml_backend_tensor_copy
//
GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
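For orientation, a minimal usage sketch of the allocation API declared above, using the pre-rename names as they appear in this diff (not part of the change itself). It assumes ggml_backend_cpu_init is declared in this header, as its definition in ggml-backend.c above suggests; the tensor-allocation step is left as a comment because its signature is not shown in this hunk, and the sizes are illustrative.

#include "ggml-backend.h"

void example_buffer_lifecycle(void) {
    struct ggml_backend cpu = ggml_backend_cpu_init();

    // reserve 16 MB of tensor data, with room for up to 16 tensors (illustrative sizes)
    struct ggml_buffer buf = ggml_backend_alloc_buffer(&cpu, 16*1024*1024, 16);

    // ... allocate tensors in buf, build a graph, then ggml_backend_graph_compute(&cpu, gf) ...

    ggml_backend_reset_buffer(&buf); // presumably drops the tensor allocations so the buffer can be reused
    ggml_backend_free_buffer(&buf);  // releases the backing memory
}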
5 changes: 3 additions & 2 deletions ggml-cuda.cu
@@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) {
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context;

ggml_backend cuda_backend = {
/* .interface = */ &cuda_backend_interface,
/* .context = */ ctx
/* .interface = */ &cuda_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ false,
};
return cuda_backend;
}
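The new is_ram_shared flag distinguishes backends whose buffers live in host RAM (true for the CPU backend, and likely for Metal's unified memory) from backends with separate device memory (false here for CUDA). Below is a hedged sketch of the kind of branch this enables; the helper is hypothetical and not code from this PR.

#include <string.h>

#include "ggml.h"
#include "ggml-backend.h"

// hypothetical illustration: choose between writing tensor data in place and
// routing it through the backend's own copy path
static void set_tensor_data(struct ggml_backend * backend, struct ggml_tensor * t,
                            const void * data, size_t size) {
    if (backend->is_ram_shared) {
        // CPU-style backends: t->data points into host RAM, so a plain copy is enough
        memcpy(t->data, data, size);
    } else {
        // CUDA-style backends: device memory is separate, so the data has to be
        // uploaded through the backend interface (e.g. its set_tensor_async entry,
        // whose exact signature is not shown in this diff)
    }
}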
69 changes: 37 additions & 32 deletions ggml-metal.h
@@ -19,51 +19,56 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// GG: maybe return ptr and avoid the "ggml.h" include
struct ggml_backend ggml_backend_metal_init(void);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
28 changes: 28 additions & 0 deletions ggml-metal.m
@@ -992,3 +992,31 @@ void ggml_metal_graph_compute(
}
}
}

static struct ggml_backend_interface metal_backend_interface = {
/* .get_name = */ //ggml_backend_metal_name,
/* .free_context = */ //ggml_backend_metal_free_context,
/* .alloc_buffer = */ //ggml_backend_metal_alloc_buffer,
/* .free_buffer = */ //ggml_backend_metal_free_buffer,
/* .reset_buffer = */ //ggml_backend_metal_reset_buffer,
/* .alloc_tensor = */ //ggml_backend_metal_alloc_tensor,
/* .set_tensor_async = */ //ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ //ggml_backend_metal_get_tensor_async,
/* .synchronize = */ //ggml_backend_metal_synchronize,
/* .cpy_tensor_from = */ //nullptr,
/* .cpy_tensor_to = */ //nullptr,
/* .graph_plan_create = */ //ggml_backend_metal_graph_plan_create,
/* .graph_plan_free = */ //ggml_backend_metal_graph_plan_free,
/* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute,
/* .graph_compute = */ //ggml_backend_metal_graph_compute
};

struct ggml_backend ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

struct ggml_backend metal_backend = {
/* .interface = */ &metal_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true, // assumed: Metal maps host buffers into the device (unified memory), mirroring the CPU backend
};
return metal_backend;
}