5 changes: 3 additions & 2 deletions ggml-backend.c
@@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) {
ctx->work_size = 0;

struct ggml_backend cpu_backend = {
/* .interface = */ &cpu_backend_interface,
/* .context = */ ctx
/* .interface = */ &cpu_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};
return cpu_backend;
}
12 changes: 12 additions & 0 deletions ggml-backend.h
Expand Up @@ -61,7 +61,10 @@ extern "C" {

struct ggml_backend {
struct ggml_backend_interface * interface;

ggml_backend_context_t context;

bool is_ram_shared;
};

// backend helper functions
@@ -78,6 +81,15 @@ extern "C" {
static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }

// buffer and tensor allocation
// TODO:
// - return "struct ggml_buffer *"
// - fix namings:
// - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc
// - ggml_backend_free_buffer -> ggml_backend_buffer_free
// - ggml_backend_reset_buffer -> ggml_backend_buffer_reset
// - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc
// - ggml_backend_tensor_cpy -> ggml_backend_tensor_copy
//
GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
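For orientation, a minimal usage sketch of the allocation API declared above, using the pre-rename names as they appear in this diff (not part of the change itself). It assumes ggml_backend_cpu_init is declared in this header, as its definition in ggml-backend.c above suggests; the tensor-allocation step is left as a comment because its signature is not shown in this hunk, and the sizes are illustrative.

#include "ggml-backend.h"

void example_buffer_lifecycle(void) {
    struct ggml_backend cpu = ggml_backend_cpu_init();

    // reserve 16 MB of tensor data, with room for up to 16 tensors (illustrative sizes)
    struct ggml_buffer buf = ggml_backend_alloc_buffer(&cpu, 16*1024*1024, 16);

    // ... allocate tensors in buf, build a graph, then ggml_backend_graph_compute(&cpu, gf) ...

    ggml_backend_reset_buffer(&buf); // presumably drops the tensor allocations so the buffer can be reused
    ggml_backend_free_buffer(&buf);  // releases the backing memory
}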
5 changes: 3 additions & 2 deletions ggml-cuda.cu
@@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) {
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context;

ggml_backend cuda_backend = {
/* .interface = */ &cuda_backend_interface,
/* .context = */ ctx
/* .interface = */ &cuda_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ false,
};
return cuda_backend;
}
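The new is_ram_shared flag distinguishes backends whose buffers live in host RAM (true for the CPU backend, and likely for Metal's unified memory) from backends with separate device memory (false here for CUDA). Below is a hedged sketch of the kind of branch this enables; the helper is hypothetical and not code from this PR.

#include <string.h>

#include "ggml.h"
#include "ggml-backend.h"

// hypothetical illustration: choose between writing tensor data in place and
// routing it through the backend's own copy path
static void set_tensor_data(struct ggml_backend * backend, struct ggml_tensor * t,
                            const void * data, size_t size) {
    if (backend->is_ram_shared) {
        // CPU-style backends: t->data points into host RAM, so a plain copy is enough
        memcpy(t->data, data, size);
    } else {
        // CUDA-style backends: device memory is separate, so the data has to be
        // uploaded through the backend interface (e.g. its set_tensor_async entry,
        // whose exact signature is not shown in this diff)
    }
}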
69 changes: 37 additions & 32 deletions ggml-metal.h
@@ -19,51 +19,56 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// GG: maybe return ptr and avoid the "ggml.h" include
struct ggml_backend ggml_backend_metal_init(void);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
28 changes: 28 additions & 0 deletions ggml-metal.m
@@ -992,3 +992,31 @@ void ggml_metal_graph_compute(
}
}
}

static struct ggml_backend_interface metal_backend_interface = {
/* .get_name = */ //ggml_backend_metal_name,
/* .free_context = */ //ggml_backend_metal_free_context,
/* .alloc_buffer = */ //ggml_backend_metal_alloc_buffer,
/* .free_buffer = */ //ggml_backend_metal_free_buffer,
/* .reset_buffer = */ //ggml_backend_metal_reset_buffer,
/* .alloc_tensor = */ //ggml_backend_metal_alloc_tensor,
/* .set_tensor_async = */ //ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ //ggml_backend_metal_get_tensor_async,
/* .synchronize = */ //ggml_backend_metal_synchronize,
/* .cpy_tensor_from = */ //nullptr,
/* .cpy_tensor_to = */ //nullptr,
/* .graph_plan_create = */ //ggml_backend_metal_graph_plan_create,
/* .graph_plan_free = */ //ggml_backend_metal_graph_plan_free,
/* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute,
/* .graph_compute = */ //ggml_backend_metal_graph_compute
};

struct ggml_backend ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

struct ggml_backend metal_backend = {
/* .interface = */ &metal_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true, // assumed: Metal maps host buffers into the device (unified memory), mirroring the CPU backend
};
return metal_backend;
}