7 changes: 5 additions & 2 deletions ggml-backend.c
@@ -94,6 +94,7 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
*allocator = (struct ggml_backend_buffer){
/* .interface = */ ggml_allocator_simple_interface,
/* .context = */ ctx,
/* .backend_size = */ 0,
/* .backend_data = */ NULL,
};
return allocator;
@@ -192,6 +193,7 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba

struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
buffer->backend_size = size;
buffer->backend_data = data;

return buffer;
@@ -311,8 +313,9 @@ struct ggml_backend * ggml_backend_cpu_init(void) {
struct ggml_backend * cpu_backend = malloc(sizeof(struct ggml_backend));

*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_interface,
/* .context = */ ctx
/* .interface = */ cpu_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};
return cpu_backend;
}
3 changes: 3 additions & 0 deletions ggml-backend.h
@@ -27,6 +27,7 @@ extern "C" {
struct ggml_backend_buffer {
struct ggml_backend_buffer_interface interface;
ggml_buffer_context_t context;
size_t backend_size;
void * backend_data;
};

@@ -96,6 +97,8 @@ extern "C" {
struct ggml_backend {
struct ggml_backend_interface interface;
ggml_backend_context_t context;

bool is_ram_shared;
};

// backend helper functions
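As a quick illustration of what the new is_ram_shared flag enables (not part of this diff): callers can branch on it to decide whether tensor data needs an explicit upload. A minimal sketch, assuming a host-side source buffer; upload_to_device() and example_set_tensor_data() are hypothetical placeholders for whatever copy path the real backend interface exposes:

#include <string.h>
#include "ggml-backend.h"

static void upload_to_device(void * dst, const void * src, size_t size) {
    // hypothetical placeholder: a real backend would issue a device copy here
    (void) dst; (void) src; (void) size;
}

static void example_set_tensor_data(struct ggml_backend * backend,
                                    void * dst, const void * src, size_t size) {
    if (backend->is_ram_shared) {
        // CPU and Metal share system RAM with the host, so a plain copy is enough
        memcpy(dst, src, size);
    } else {
        // CUDA keeps tensor data in device memory, so it has to go through an upload path
        upload_to_device(dst, src, size);
    }
}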
5 changes: 3 additions & 2 deletions ggml-cuda.cu
@@ -1810,8 +1810,9 @@ ggml_backend * ggml_backend_cuda_init(void) {

ggml_backend * cuda_backend = new ggml_backend;
*cuda_backend = (ggml_backend){
/* .interface = */ cuda_backend_interface,
/* .context = */ ctx
/* .interface = */ cuda_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ false,
};
return cuda_backend;
}
73 changes: 43 additions & 30 deletions ggml-metal.h
@@ -19,51 +19,64 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu);

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
// TODO: temporary - move to backend interface
bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
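Taken together, the two new declarations above imply roughly the following call sequence. This is a sketch based only on the signatures in this header; the buffer name "data", the size arguments, and example_metal_setup() itself are illustrative assumptions:

#include <stddef.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

struct ggml_backend * example_metal_setup(struct ggml_backend * backend_cpu,
                                          void * data, size_t size, size_t max_tensor_size) {
    // create the Metal backend on top of an existing CPU backend
    struct ggml_backend * backend_metal = ggml_backend_metal_init(backend_cpu);
    if (backend_metal == NULL) {
        return NULL;
    }

    // every host buffer used by the graph has to be mapped before computing on it,
    // mirroring the contract of the old ggml_metal_add_buffer
    if (!ggml_backend_metal_map_buffer(backend_metal, "data", data, size, max_tensor_size)) {
        return NULL;
    }

    return backend_metal;
}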
68 changes: 64 additions & 4 deletions ggml-metal.m
@@ -242,12 +242,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
return nil;
}

// TODO: rename to ggml_metal_map_buffer
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size) {
const char * name,
void * data,
size_t size,
size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
fprintf(stderr, "%s: too many buffers\n", __func__);
return false;
@@ -992,3 +993,62 @@ void ggml_metal_graph_compute(
}
}
}

bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size) {
return ggml_metal_add_buffer(backend->context, name, data, size, max_size);
}

static const char * ggml_backend_metal_name(struct ggml_backend * ctx) {
return "Metal";

UNUSED(ctx);
}

static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
ggml_metal_graph_compute(backend->context, cgraph);
}

static struct ggml_backend_interface metal_backend_interface = {
/* .get_name = */ ggml_backend_metal_name,
// the NULL entries below are filled in from the CPU backend in ggml_backend_metal_init
/* .free = */ NULL,
/* .alloc_buffer = */ NULL,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
};

struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
struct ggml_metal_context * ctx = ggml_metal_init(8);

struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend));
*backend_metal = (struct ggml_backend){
/* .interface = */ metal_backend_interface,
/* .context = */ ctx,
/* .is_ram_shared = */ true,
};

// reuses CPU calls for now
backend_metal->interface.free = backend_cpu->interface.free;
backend_metal->interface.alloc_buffer = backend_cpu->interface.alloc_buffer;
backend_metal->interface.set_tensor_async = backend_cpu->interface.set_tensor_async;
backend_metal->interface.get_tensor_async = backend_cpu->interface.get_tensor_async;
backend_metal->interface.synchronize = backend_cpu->interface.synchronize;
backend_metal->interface.cpy_tensor_from = backend_cpu->interface.cpy_tensor_from;
backend_metal->interface.cpy_tensor_to = backend_cpu->interface.cpy_tensor_to;
backend_metal->interface.graph_plan_create = backend_cpu->interface.graph_plan_create;
backend_metal->interface.graph_plan_free = backend_cpu->interface.graph_plan_free;
backend_metal->interface.graph_plan_compute = backend_cpu->interface.graph_plan_compute;

return backend_metal;
}
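Because the Metal backend shares RAM with the host and reuses the CPU allocation paths, one plausible consumer of the new backend_size field is mapping a buffer allocated through the generic backend interface into Metal. A hedged sketch, assuming a ggml_backend_buffer obtained from the backend API; example_map_backend_buffer() and the name "buf" are made up for illustration:

#include <stdbool.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

static bool example_map_backend_buffer(struct ggml_backend * backend_metal,
                                       struct ggml_backend_buffer * buffer,
                                       size_t max_tensor_size) {
    // backend_data and backend_size record the raw allocation made by the backend,
    // which is exactly the information the Metal mapping call needs
    return ggml_backend_metal_map_buffer(backend_metal, "buf",
                                         buffer->backend_data,
                                         buffer->backend_size,
                                         max_tensor_size);
}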