ggml-backend.c (8 changes: 6 additions & 2 deletions)

@@ -145,6 +145,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         return;
     }
 
+    //printf("src->data = %p, src->extra = %p\n", src->data, src->extra);
+    //printf("dst->data = %p, dst->extra = %p\n", dst->data, dst->extra);
+
     if (dst->backend->interface.cpy_tensor_from != NULL) {
         dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst);
     } else if (src->backend->interface.cpy_tensor_to != NULL) {
@@ -311,8 +314,9 @@ struct ggml_backend * ggml_backend_cpu_init(void) {
     struct ggml_backend * cpu_backend = malloc(sizeof(struct ggml_backend));
 
     *cpu_backend = (struct ggml_backend) {
-        /* .interface = */ cpu_backend_interface,
-        /* .context = */ ctx
+        /* .interface = */ cpu_backend_interface,
+        /* .context = */ ctx,
+        /* .is_ram_shared = */ true,
     };
     return cpu_backend;
 }
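Note: `is_ram_shared` is written here but never read within this diff. The sketch below is a hypothetical consumer, not part of the PR: a loader might use the flag to decide whether tensor data has to be staged into separate device memory at all.

    #include "ggml-backend.h"

    // hypothetical helper built on the new flag (assumed usage, not from the PR)
    static bool needs_device_upload(const struct ggml_backend * backend) {
        // the CPU backend sets is_ram_shared = true: its buffers are plain
        // host RAM, so tensor data can be used in place without a copy
        return !backend->is_ram_shared;
    }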
ggml-backend.h (2 changes: 2 additions & 0 deletions)

@@ -96,6 +96,8 @@ extern "C" {
 struct ggml_backend {
     struct ggml_backend_interface interface;
     ggml_backend_context_t context;
+
+    bool is_ram_shared;
 };
 
 // backend helper functions
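The `/* .interface = */` comments in the initializers elsewhere in this diff stand in for designated initializers, which pre-C++20 compilers do not accept; the comments document field order while keeping the sources compilable as both C and C++. In plain C, the CPU backend's initializer could equivalently use real designated initializers, as in this sketch (values restated from the diff above):

    struct ggml_backend backend = {
        .interface     = cpu_backend_interface,
        .context       = ctx,
        .is_ram_shared = true, // CPU buffers live in host RAM
    };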
ggml-cuda.cu (5 changes: 3 additions & 2 deletions)

@@ -1810,8 +1810,9 @@ ggml_backend * ggml_backend_cuda_init(void) {

     ggml_backend * cuda_backend = new ggml_backend;
     *cuda_backend = (ggml_backend){
-        /* .interface = */ cuda_backend_interface,
-        /* .context = */ ctx
+        /* .interface = */ cuda_backend_interface,
+        /* .context = */ ctx,
+        /* .is_ram_shared = */ false,
     };
     return cuda_backend;
 }
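With both constructors filling the flag, calling code can branch on it directly. A minimal usage sketch (error handling omitted; the branch body is illustrative, not from the PR):

    struct ggml_backend * backend = ggml_backend_cuda_init();
    if (!backend->is_ram_shared) {
        // discrete GPU memory: weights must be transferred to the device,
        // e.g. with ggml_backend_tensor_copy() from a CPU-backend tensor
    }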
ggml-metal.h (69 changes: 35 additions & 34 deletions)

@@ -22,48 +22,49 @@
 #include <stddef.h>
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
 
-struct ggml_tensor;
-struct ggml_cgraph;
+//struct ggml_tensor;
+//struct ggml_cgraph;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct ggml_metal_context;
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
+struct ggml_backend;
 
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+struct ggml_backend * ggml_backend_metal_init(void);
 
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
+//struct ggml_metal_context;
+//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                             size_t size,
-                             size_t max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+//// number of command buffers to use
+//struct ggml_metal_context * ggml_metal_init(int n_cb);
+//void ggml_metal_free(struct ggml_metal_context * ctx);
+//
+//// set the number of command buffers to use
+//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+//
+//// creates a mapping between a host memory buffer and a device memory buffer
+//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+//// - the mapping is used during computation to determine the arguments of the compute kernels
+//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//// - max_size specifies the maximum size of a tensor and is used to create shared views such
+////   that it is guaranteed that the tensor will fit in at least one of the views
+////
+//bool ggml_metal_add_buffer(
+//        struct ggml_metal_context * ctx,
+//                       const char * name,
+//                             void * data,
+//                             size_t size,
+//                             size_t max_size);
+//
+//// set data from host memory into the device
+//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// get data from the device into host memory
+//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// same as ggml_graph_compute but uses Metal
+//// creates gf->n_threads command buffers in parallel
+//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
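The old Metal entry points are retired behind a single `ggml_backend_metal_init()` constructor, giving Metal the same shape as the CPU and CUDA backends. A sketch of runtime backend selection under that assumption (`pick_backend` is hypothetical; `GGML_USE_METAL` is assumed to match the project's existing build flag):

    #include "ggml-backend.h"

    static struct ggml_backend * pick_backend(bool prefer_gpu) {
    #ifdef GGML_USE_METAL
        if (prefer_gpu) {
            return ggml_backend_metal_init();
        }
    #endif
        (void) prefer_gpu; // silence unused-parameter warning in CPU-only builds
        return ggml_backend_cpu_init();
    }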