From 0a3861c47b8d787e7d40550bcbc95109eb366982 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 16:54:41 +0300 Subject: [PATCH 1/8] metal : adapting to ggml_backend (WIP) --- ggml-metal.h | 69 ++++++++++++++++++++++++++++------------------------ ggml-metal.m | 28 +++++++++++++++++++++ llama.cpp | 43 +++++++++++++++++++++++++------- 3 files changed, 99 insertions(+), 41 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index 928f1705c381c..a726ddd1cd842 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -19,51 +19,56 @@ #pragma once +#include "ggml.h" + #include #include // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 -struct ggml_tensor; -struct ggml_cgraph; +//struct ggml_tensor; +//struct ggml_cgraph; #ifdef __cplusplus extern "C" { #endif -struct ggml_metal_context; - -// number of command buffers to use -struct ggml_metal_context * ggml_metal_init(int n_cb); -void ggml_metal_free(struct ggml_metal_context * ctx); - -// set the number of command buffers to use -void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); +// GG: maybe return ptr and avoid the "ggml.h" include +struct ggml_backend ggml_backend_metal_init(); -// creates a mapping between a host memory buffer and a device memory buffer -// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute -// - the mapping is used during computation to determine the arguments of the compute kernels -// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal -// - max_size specifies the maximum size of a tensor and is used to create shared views such -// that it is guaranteed that the tensor will fit in at least one of the views +//struct ggml_metal_context; // -bool ggml_metal_add_buffer( - struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size); - -// set data from host memory into the device -void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); - -// get data from the device into host memory -void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); - -// same as ggml_graph_compute but uses Metal -// creates gf->n_threads command buffers in parallel -void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +//// number of command buffers to use +//struct ggml_metal_context * ggml_metal_init(int n_cb); +//void ggml_metal_free(struct ggml_metal_context * ctx); +// +//// set the number of command buffers to use +//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); +// +//// creates a mapping between a host memory buffer and a device memory buffer +//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute +//// - the mapping is used during computation to determine the arguments of the compute kernels +//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +//// - max_size specifies the maximum size of a tensor and is used to create shared views such +//// that it is guaranteed that the tensor will fit in at least one of the views +//// +//bool ggml_metal_add_buffer( +// struct ggml_metal_context * ctx, +// const char * name, +// void * data, +// size_t size, +// size_t max_size); +// +//// set data from host memory into the device +//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +// +//// get data from the device into host memory 
+//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +// +//// same as ggml_graph_compute but uses Metal +//// creates gf->n_threads command buffers in parallel +//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/ggml-metal.m b/ggml-metal.m index ee205bcdf773c..d7ff833a4d7f1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -992,3 +992,31 @@ void ggml_metal_graph_compute( } } } + +static struct ggml_backend_interface metal_backend_interface = { + /* .get_name = */ //ggml_backend_metal_name, + /* .free_context = */ //ggml_backend_metal_free_context, + /* .alloc_buffer = */ //ggml_backend_metal_alloc_buffer, + /* .free_buffer = */ //ggml_backend_metal_free_buffer, + /* .reset_buffer = */ //ggml_backend_metal_reset_buffer, + /* .alloc_tensor = */ //ggml_backend_metal_alloc_tensor, + /* .set_tensor_async = */ //ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ //ggml_backend_metal_get_tensor_async, + /* .synchronize = */ //ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ //nullptr, + /* .cpy_tensor_to = */ //nullptr, + /* .graph_plan_create = */ //ggml_backend_metal_graph_plan_create, + /* .graph_plan_free = */ //ggml_backend_metal_graph_plan_free, + /* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute, + /* .graph_compute = */ //ggml_backend_metal_graph_compute +}; + +struct ggml_backend ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + + struct ggml_backend metal_backend = { + /* .interface = */ &metal_backend_interface, + /* .context = */ ctx + }; + return metal_backend; +} diff --git a/llama.cpp b/llama.cpp index 61e31f45fd38e..d4b563b84cc13 100644 --- a/llama.cpp +++ b/llama.cpp @@ -233,6 +233,11 @@ struct llama_model { ggml_buffer buf_cuda; ggml_context * ctx_cuda = NULL; #endif +#ifdef GGML_USE_METAL + ggml_backend backend_metal; + ggml_buffer buf_metal; + ggml_context * ctx_metal = NULL; +#endif // backend assigned to each layer ggml_backend * backend_input = NULL; @@ -249,6 +254,12 @@ struct llama_model { ggml_free(ctx_cuda); ggml_backend_free_buffer(&buf_cuda); } +#endif +#ifdef GGML_USE_METAL + if (ctx_metal) { + ggml_free(ctx_metal); + ggml_backend_free_buffer(&buf_metal); + } #endif } }; @@ -290,6 +301,9 @@ struct llama_context { #ifdef GGML_USE_CUDA ggml_buffer buf_compute_cuda = {}; #endif +#ifdef GGML_USE_METAL + ggml_buffer buf_compute_metal = {}; +#endif // input tensors struct ggml_tensor * graph_tokens_in = nullptr; @@ -940,6 +954,8 @@ static void llama_model_load_internal( const uint32_t n_layer = hparams.n_layer; model.backend_cpu = ggml_backend_cpu_init(); + + ggml_backend * backend_cpu = &model.backend_cpu; ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection #ifdef GGML_USE_CUDA if (n_gpu_layers > 0) { @@ -947,14 +963,21 @@ static void llama_model_load_internal( backend_gpu = &model.backend_cuda; } #endif +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + model.backend_metal = ggml_backend_metal_init(); + backend_gpu = &model.backend_metal; + } +#endif // assign splits to the backends const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers); - model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : &model.backend_cpu; - model.backend_output = n_gpu_layers > 0 ? backend_gpu : &model.backend_cpu; + model.backend_input = n_gpu_layers > (int)n_layer ? 
backend_gpu : backend_cpu; + model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu; + model.backend_layers.resize(n_layer); - std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu); - std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu); + std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu); + std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu); // calculate the size of each context std::unordered_map ctx_sizes; @@ -977,17 +1000,18 @@ static void llama_model_load_internal( ctx_sizes[model.backend_layers[layer]] += lt.size; } } + // TODO: generalize support for mmap size_t mmap_size = 0; if (ml->use_mmap) { - mmap_size = ctx_sizes[&model.backend_cpu]; - ctx_sizes[&model.backend_cpu] = 0; + mmap_size = ctx_sizes[backend_cpu]; + ctx_sizes[backend_cpu] = 0; } fprintf(stderr, "%s: ggml ctx sizes:\n", __func__); for (const auto & it : ctx_sizes) { fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0); - if (it.first == &model.backend_cpu && ml->use_mmap) { + if (it.first == backend_cpu && ml->use_mmap) { fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0); } fprintf(stderr, "\n"); @@ -996,8 +1020,8 @@ static void llama_model_load_internal( // create the buffers and contexts { size_t cpu_num_tensors = ml->tensors_map.tensors.size(); - size_t ctx_size = ctx_sizes[&model.backend_cpu]; - model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors); + size_t ctx_size = ctx_sizes[backend_cpu]; + model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors); struct ggml_init_params params = ggml_init_params_default(); params.buffer = &model.buf_cpu; params.no_alloc = ml->use_mmap; @@ -1028,6 +1052,7 @@ static void llama_model_load_internal( if (model.backend_input == backend_gpu) ctx_input = ctx_gpu; ggml_context * ctx_output = model.ctx_cpu; if (model.backend_output == backend_gpu) ctx_output = ctx_gpu; + std::vector ctx_layers(n_layer, model.ctx_cpu); for (uint32_t i = 0; i < n_layer; ++i) { if (model.backend_layers[i] == backend_gpu) { From 90503f150d0513d38ab6ee117b3426872de312c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 17:52:13 +0300 Subject: [PATCH 2/8] llama : init metal backend as CPU backend for now --- ggml-backend.h | 2 +- llama.cpp | 73 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/ggml-backend.h b/ggml-backend.h index ce5aac2b5fbab..44b9f785f3dde 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -78,7 +78,7 @@ extern "C" { static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); } // buffer and tensor allocation - GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); + GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer); static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); } static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct 
ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); } diff --git a/llama.cpp b/llama.cpp index d4b563b84cc13..e4a566df010ca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -240,8 +240,8 @@ struct llama_model { #endif // backend assigned to each layer - ggml_backend * backend_input = NULL; - ggml_backend * backend_output = NULL; + ggml_backend * backend_inp = NULL; + ggml_backend * backend_out = NULL; std::vector backend_layers; ~llama_model() { @@ -965,15 +965,15 @@ static void llama_model_load_internal( #endif #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { - model.backend_metal = ggml_backend_metal_init(); + model.backend_metal = ggml_backend_cpu_init(); backend_gpu = &model.backend_metal; } #endif // assign splits to the backends const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers); - model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu; - model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu; + model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu; + model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu; model.backend_layers.resize(n_layer); std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu); @@ -983,10 +983,10 @@ static void llama_model_load_internal( std::unordered_map ctx_sizes; for (const llama_load_tensor & lt : ml->tensors_map.tensors) { if (lt.name == "tok_embeddings.weight") { - ctx_sizes[model.backend_input] += lt.size; + ctx_sizes[model.backend_inp] += lt.size; } else if (lt.name == "norm.weight" || lt.name == "output.weight") { - ctx_sizes[model.backend_output] += lt.size; + ctx_sizes[model.backend_out] += lt.size; } else { // parse layer number from name @@ -1032,6 +1032,7 @@ static void llama_model_load_internal( } ggml_context * ctx_gpu = model.ctx_cpu; + #ifdef GGML_USE_CUDA if (n_gpu_layers > 0) { size_t gpu_num_tensors = ml->tensors_map.tensors.size(); @@ -1043,15 +1044,35 @@ static void llama_model_load_internal( if (!model.ctx_cuda) { throw std::runtime_error(format("ggml_init() failed for CUDA backend")); } + ctx_gpu = model.ctx_cuda; } #endif +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + // the metal context is actually a CPU context because we have unified memory + const size_t ctx_size = ctx_sizes[&model.backend_metal]; + const size_t n_tensors = ml->tensors_map.tensors.size(); + + model.buf_metal = ggml_backend_alloc_buffer(&model.backend_metal, ctx_size, n_tensors); + + struct ggml_init_params params = ggml_init_params_default(); + params.buffer = &model.buf_metal; + params.no_alloc = ml->use_mmap; + + model.ctx_metal = ggml_init(params); + if (!model.ctx_metal) { + throw std::runtime_error(format("ggml_init() failed for CPU backend")); + } + + ctx_gpu = model.ctx_metal; + } +#endif + // TODO: clean this - ggml_context * ctx_input = model.ctx_cpu; - if (model.backend_input == backend_gpu) ctx_input = ctx_gpu; - ggml_context * ctx_output = model.ctx_cpu; - if (model.backend_output == backend_gpu) ctx_output = ctx_gpu; + ggml_context * ctx_input = (model.backend_inp == backend_gpu) ? ctx_gpu : model.ctx_cpu; + ggml_context * ctx_output = (model.backend_out == backend_gpu) ? ctx_gpu : model.ctx_cpu; std::vector ctx_layers(n_layer, model.ctx_cpu); for (uint32_t i = 0; i < n_layer; ++i) { @@ -1102,7 +1123,6 @@ static void llama_model_load_internal( (void) low_vram; (void) n_batch; - // print memory requirements { const size_t scale = memory_type == GGML_TYPE_F32 ? 
2 : 1; @@ -1224,29 +1244,30 @@ static ggml_graph_splits llama_build_graph( #endif // TODO: clean this - struct ggml_context * ctx_i = nullptr; + struct ggml_context * ctx_i = nullptr; struct ggml_context * ctx_ls[80] = {nullptr}; - struct ggml_context * ctx_o = nullptr; - struct ggml_context * ctx_kv = nullptr; + struct ggml_context * ctx_o = nullptr; + struct ggml_context * ctx_kv = nullptr; - if (lctx.model.backend_input == &lctx.model.backend_cpu) ctx_i = ctx_cpu; - if (lctx.model.backend_output == &lctx.model.backend_cpu) ctx_o = ctx_cpu; + if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu; + if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu; #ifdef GGML_USE_CUDA - if (lctx.model.backend_input == &lctx.model.backend_cuda) ctx_i = ctx_cuda; - if (lctx.model.backend_output == &lctx.model.backend_cuda) ctx_o = ctx_cuda; + if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda; + if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda; #endif + for (int il = 0; il < n_layer; il++) { - if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu; + if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu; #ifdef GGML_USE_CUDA if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda; #endif } - if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu; + + if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu; #ifdef GGML_USE_CUDA if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda; #endif - struct ggml_tensor * inpL; if (embeddings_input) { @@ -2678,7 +2699,7 @@ struct llama_context * llama_new_context_with_model( buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens // TODO: input embeddings should be optional to save memory buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings - ctx->buf_input = ggml_backend_alloc_buffer(model->backend_input, buf_input_size, 2); + ctx->buf_input = ggml_backend_alloc_buffer(model->backend_inp, buf_input_size, 2); struct ggml_init_params ggml_params = ggml_init_params_default(); ggml_params.buffer = &ctx->buf_input; @@ -2702,7 +2723,7 @@ struct llama_context * llama_new_context_with_model( if (params.embedding) { buf_output_size += hparams.n_embd * ggml_type_size(GGML_TYPE_F32); } - ctx->buf_output = ggml_backend_alloc_buffer(model->backend_output, buf_output_size, 2); + ctx->buf_output = ggml_backend_alloc_buffer(model->backend_out, buf_output_size, 2); struct ggml_init_params ggml_params = ggml_init_params_default(); ggml_params.buffer = &ctx->buf_output; @@ -2731,7 +2752,7 @@ struct llama_context * llama_new_context_with_model( } fprintf(stderr, "%s: layer backends: ", __func__); - fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_input)); + fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp)); int start = 0; struct ggml_backend * prev_backend = ctx->model.backend_layers[0]; @@ -2746,7 +2767,7 @@ struct llama_context * llama_new_context_with_model( prev_backend = ctx->model.backend_layers[i]; } } - fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_output)); + fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_out)); fprintf(stderr, "kv: %s\n", ggml_backend_name(ctx->backend_kv)); #ifdef GGML_USE_MPI From 652c849643a81d0fee3f178b90f093f71d1f49f5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 18:51:02 
+0300 Subject: [PATCH 3/8] ggml : add is_ram_shared to ggml_backend Metal can share the RAM memory and can utilize mmap without temp buffer --- ggml-backend.c | 5 ++-- ggml-backend.h | 14 ++++++++++- ggml-cuda.cu | 5 ++-- llama.cpp | 65 +++++++++++++++++++++++++++++++++++++------------- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 85a6cac05f157..bd97a5b498062 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) { ctx->work_size = 0; struct ggml_backend cpu_backend = { - /* .interface = */ &cpu_backend_interface, - /* .context = */ ctx + /* .interface = */ &cpu_backend_interface, + /* .context = */ ctx, + /* .is_ram_shared = */ true, }; return cpu_backend; } diff --git a/ggml-backend.h b/ggml-backend.h index 44b9f785f3dde..635555719d360 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -61,7 +61,10 @@ extern "C" { struct ggml_backend { struct ggml_backend_interface * interface; + ggml_backend_context_t context; + + bool is_ram_shared; }; // backend helper functions @@ -78,7 +81,16 @@ extern "C" { static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); } // buffer and tensor allocation - GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr + // TODO: + // - return "struct ggml_buffer *" + // - fix namings: + // - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc + // - ggml_backend_free_buffer -> ggml_backend_buffer_free + // - ggml_backend_reset_buffer -> ggml_backend_buffer_reset + // - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc + // - ggml_backend_tensor_cpy -> ggml_backend_tensor_copy + // + GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer); static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); } static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 343eda0b2bd74..a2d7c545b3690 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) { ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; ggml_backend cuda_backend = { - /* .interface = */ &cuda_backend_interface, - /* .context = */ ctx + /* .interface = = */ &cuda_backend_interface, + /* .context = */ ctx, + /* .is_ram_shared = */ false, }; return cuda_backend; } diff --git a/llama.cpp b/llama.cpp index e4a566df010ca..c234cdf3fb607 100644 --- a/llama.cpp +++ b/llama.cpp @@ -225,6 +225,7 @@ struct llama_model { llama_vocab vocab; // backends + // TODO: change to pointers ggml_backend backend_cpu; ggml_buffer buf_cpu; ggml_context * ctx_cpu = NULL; @@ -298,6 +299,7 @@ struct llama_context { // memory buffers used to evaluate the model ggml_buffer buf_compute_cpu = {}; + #ifdef GGML_USE_CUDA ggml_buffer buf_compute_cuda = {}; #endif @@ -612,7 +614,7 @@ struct llama_model_loader { } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * 
lmlock) { + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t lock_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { @@ -634,11 +636,11 @@ struct llama_model_loader { } LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already - bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu; + const bool is_ram_shared = lt.ggml_tensor->backend->is_ram_shared; // select buffer to load data into if (!use_mmap) { - if (is_cpu) { + if (is_ram_shared) { lt.data = (uint8_t *) lt.ggml_tensor->data; } else { // read to temporary buffer @@ -649,7 +651,7 @@ struct llama_model_loader { load_data_for(lt); - if (is_cpu) { + if (is_ram_shared) { if (use_mmap) { lt.ggml_tensor->data = lt.data; // TODO: this assumes that the data to lock is contiguous, which may not always be the case @@ -671,7 +673,7 @@ struct llama_model_loader { } } - void load_data_for(llama_load_tensor & lt) { + void load_data_for(llama_load_tensor & lt) const { if (use_mmap) { lt.data = (uint8_t *) mapping->addr + lt.file_off; } else { @@ -957,6 +959,7 @@ static void llama_model_load_internal( ggml_backend * backend_cpu = &model.backend_cpu; ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection + #ifdef GGML_USE_CUDA if (n_gpu_layers > 0) { model.backend_cuda = ggml_backend_cuda_init(); @@ -965,13 +968,14 @@ static void llama_model_load_internal( #endif #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { - model.backend_metal = ggml_backend_cpu_init(); + model.backend_metal = ggml_backend_metal_init(); backend_gpu = &model.backend_metal; } #endif // assign splits to the backends const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers); + model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu; model.backend_out = n_gpu_layers > 0 ? 
backend_gpu : backend_cpu; @@ -1011,7 +1015,7 @@ static void llama_model_load_internal( fprintf(stderr, "%s: ggml ctx sizes:\n", __func__); for (const auto & it : ctx_sizes) { fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0); - if (it.first == backend_cpu && ml->use_mmap) { + if (it.first->is_ram_shared && ml->use_mmap) { fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0); } fprintf(stderr, "\n"); @@ -1135,12 +1139,10 @@ static void llama_model_load_internal( ctx_sum += it.second; } - const size_t mem_required = - ctx_sum + MEM_REQ_EVAL().at(model.type); + const size_t mem_required = ctx_sum + MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state - const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); + const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1162,6 +1164,7 @@ static void llama_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + } static bool llama_model_load( @@ -1226,6 +1229,7 @@ static ggml_graph_splits llama_build_graph( // initialize contexts for every backend struct ggml_context * ctx_cpu = nullptr; + if (lctx.buf_compute_cpu.mem_size > 0) { struct ggml_init_params params = ggml_init_params_default(); params.buffer = &lctx.buf_compute_cpu; @@ -1235,6 +1239,7 @@ static ggml_graph_splits llama_build_graph( #ifdef GGML_USE_CUDA struct ggml_context * ctx_cuda = nullptr; + if (lctx.buf_compute_cuda.mem_size > 0) { struct ggml_init_params params = ggml_init_params_default(); params.buffer = &lctx.buf_compute_cuda; @@ -1243,30 +1248,54 @@ static ggml_graph_splits llama_build_graph( } #endif +#ifdef GGML_USE_METAL + struct ggml_context * ctx_metal = nullptr; + + if (lctx.buf_compute_metal.mem_size > 0) { + struct ggml_init_params params = ggml_init_params_default(); + params.buffer = &lctx.buf_compute_metal; + params.compute_type = compute_type; + ctx_metal = ggml_init(params); + } +#endif + // TODO: clean this struct ggml_context * ctx_i = nullptr; - struct ggml_context * ctx_ls[80] = {nullptr}; struct ggml_context * ctx_o = nullptr; struct ggml_context * ctx_kv = nullptr; + struct ggml_context * ctx_ls[80] = {nullptr}; if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu; if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu; + #ifdef GGML_USE_CUDA if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda; if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda; #endif +#ifdef GGML_USE_METAL + if (lctx.model.backend_inp == &lctx.model.backend_metal) ctx_i = ctx_metal; + if (lctx.model.backend_out == &lctx.model.backend_metal) ctx_o = ctx_metal; +#endif for (int il = 0; il < n_layer; il++) { - if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu; + if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu; + #ifdef GGML_USE_CUDA if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda; +#endif +#ifdef GGML_USE_METAL + if (lctx.model.backend_layers[il] == &lctx.model.backend_metal) ctx_ls[il] = ctx_metal; #endif } - if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu; + if (lctx.backend_kv == 
&lctx.model.backend_cpu) ctx_kv = ctx_cpu; + #ifdef GGML_USE_CUDA if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda; #endif +#ifdef GGML_USE_METAL + if (lctx.backend_kv == &lctx.model.backend_metal) ctx_kv = ctx_metal; +#endif struct ggml_tensor * inpL; @@ -1522,7 +1551,7 @@ static ggml_graph_splits llama_build_graph( //} #ifdef LLAMA_1L_GRAPH_DUMP - if (N==1 && n_past == 0) { + if (N == 1 && n_past == 0) { ggml_graph_dump_dot(gf, NULL, "llama.dot"); printf("graph for N=%i, n_past=%i dumped to llama.dot\n", N, n_past); exit(0); @@ -1547,6 +1576,11 @@ static ggml_graph_splits llama_build_graph( ggml_free(ctx_cuda); } #endif +#ifdef GGML_USE_METAL + if (ctx_metal != nullptr) { + ggml_free(ctx_metal); + } +#endif return splits; } @@ -2651,7 +2685,6 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - // TODO: choose backend depending on n_layers/low_vram #ifdef GGML_USE_CUDA if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) { From ed960fa1ab91e0b90e57eb72fa4cabadcac405de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 19:19:59 +0300 Subject: [PATCH 4/8] llama : separate compute buffer for metal --- llama.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index c234cdf3fb607..867b3e59fd56e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1616,7 +1616,6 @@ static bool llama_eval_internal( LLAMA_ASSERT(lctx.graph_logits != nullptr); - // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads; @@ -2719,11 +2718,17 @@ struct llama_context * llama_new_context_with_model( // TODO: size the buffers more accurately - depends on improved memory management ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048); + #ifdef GGML_USE_CUDA if (params.n_gpu_layers > 0) { ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048); } #endif +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + ctx->buf_compute_metal = ggml_backend_alloc_buffer(&model->backend_metal, MEM_REQ_EVAL().at(ctx->model.type), 2048); + } +#endif // initialize the graph input/output buffers // input buffer From 70c55c17c74e07e04b6892ae10a823f61055d44a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Jul 2023 16:47:43 +0300 Subject: [PATCH 5/8] metal : create backend, mostly reuse CPU backend interface --- ggml-metal.h | 2 +- ggml-metal.m | 69 +++++++++++++++++++++++++++++++++++++--------------- llama.cpp | 19 +++++++++------ 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index a726ddd1cd842..6d99d7e5a11ad 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -35,7 +35,7 @@ extern "C" { #endif // GG: maybe return ptr and avoid the "ggml.h" include -struct ggml_backend ggml_backend_metal_init(); +struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu); //struct ggml_metal_context; // diff --git a/ggml-metal.m b/ggml-metal.m index d7ff833a4d7f1..6d610e6780a8d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -993,30 +993,59 @@ void ggml_metal_graph_compute( } } +static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) { + return "Metal"; + + UNUSED(ctx); +} + +static void 
ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx; + + ggml_metal_graph_compute(ctx_metal, cgraph); +} + static struct ggml_backend_interface metal_backend_interface = { - /* .get_name = */ //ggml_backend_metal_name, - /* .free_context = */ //ggml_backend_metal_free_context, - /* .alloc_buffer = */ //ggml_backend_metal_alloc_buffer, - /* .free_buffer = */ //ggml_backend_metal_free_buffer, - /* .reset_buffer = */ //ggml_backend_metal_reset_buffer, - /* .alloc_tensor = */ //ggml_backend_metal_alloc_tensor, - /* .set_tensor_async = */ //ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ //ggml_backend_metal_get_tensor_async, - /* .synchronize = */ //ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ //nullptr, - /* .cpy_tensor_to = */ //nullptr, - /* .graph_plan_create = */ //ggml_backend_metal_graph_plan_create, - /* .graph_plan_free = */ //ggml_backend_metal_graph_plan_free, - /* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute, - /* .graph_compute = */ //ggml_backend_metal_graph_compute + /* .get_name = */ ggml_backend_metal_name, + /* .free_context = */ NULL, //ggml_backend_metal_free_context, + /* .alloc_buffer = */ NULL, //ggml_backend_metal_alloc_buffer, + /* .free_buffer = */ NULL, //ggml_backend_metal_free_buffer, + /* .reset_buffer = */ NULL, //ggml_backend_metal_reset_buffer, + /* .alloc_tensor = */ NULL, //ggml_backend_metal_alloc_tensor, + /* .set_tensor_async = */ NULL, //ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ NULL, //ggml_backend_metal_get_tensor_async, + /* .synchronize = */ NULL, //ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ NULL, //nullptr, + /* .cpy_tensor_to = */ NULL, //nullptr, + /* .graph_plan_create = */ NULL, //ggml_backend_metal_graph_plan_create, + /* .graph_plan_free = */ NULL, //ggml_backend_metal_graph_plan_free, + /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute, + /* .graph_compute = */ ggml_backend_metal_graph_compute, }; -struct ggml_backend ggml_backend_metal_init(void) { +struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu) { struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); - struct ggml_backend metal_backend = { - /* .interface = */ &metal_backend_interface, - /* .context = */ ctx + struct ggml_backend backend_metal = { + /* .interface = */ &metal_backend_interface, + /* .context = */ ctx, + /* .is_ram_shared = */ true, }; - return metal_backend; + + // reuses CPU calls for now + backend_metal.interface->free_context = backend_cpu->interface->free_context; + backend_metal.interface->alloc_buffer = backend_cpu->interface->alloc_buffer; + backend_metal.interface->free_buffer = backend_cpu->interface->free_buffer; + backend_metal.interface->reset_buffer = backend_cpu->interface->reset_buffer; + backend_metal.interface->alloc_tensor = backend_cpu->interface->alloc_tensor; + backend_metal.interface->set_tensor_async = backend_cpu->interface->set_tensor_async; + backend_metal.interface->get_tensor_async = backend_cpu->interface->get_tensor_async; + backend_metal.interface->synchronize = backend_cpu->interface->synchronize; + backend_metal.interface->cpy_tensor_from = backend_cpu->interface->cpy_tensor_from; + backend_metal.interface->cpy_tensor_to = backend_cpu->interface->cpy_tensor_to; + backend_metal.interface->graph_plan_create = backend_cpu->interface->graph_plan_create; + 
backend_metal.interface->graph_plan_free = backend_cpu->interface->graph_plan_free; + backend_metal.interface->graph_plan_compute = backend_cpu->interface->graph_plan_compute; + + return backend_metal; } diff --git a/llama.cpp b/llama.cpp index 867b3e59fd56e..5039f14e928e0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -968,7 +968,7 @@ static void llama_model_load_internal( #endif #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { - model.backend_metal = ggml_backend_metal_init(); + model.backend_metal = ggml_backend_metal_init(backend_cpu); backend_gpu = &model.backend_metal; } #endif @@ -1008,17 +1008,20 @@ static void llama_model_load_internal( // TODO: generalize support for mmap size_t mmap_size = 0; if (ml->use_mmap) { - mmap_size = ctx_sizes[backend_cpu]; - ctx_sizes[backend_cpu] = 0; + for (auto & it : ctx_sizes) { + if (it.first->is_ram_shared) { + mmap_size += it.second; + ctx_sizes[it.first] = 0; + } + } } fprintf(stderr, "%s: ggml ctx sizes:\n", __func__); for (const auto & it : ctx_sizes) { - fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0); - if (it.first->is_ram_shared && ml->use_mmap) { - fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0); - } - fprintf(stderr, "\n"); + fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0); + } + if (mmap_size > 0) { + fprintf(stderr, "%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0); } // create the buffers and contexts From 290cb700bffb006313bdf915dd5898e7e86a755a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Jul 2023 14:30:34 +0300 Subject: [PATCH 6/8] metal : map the CPU buffers to Metal buffers (WIP) --- ggml-backend.c | 2 ++ ggml-backend.h | 1 + ggml-metal.h | 10 +++++++++- ggml-metal.m | 51 +++++++++++++++++++++++++++----------------------- llama.cpp | 38 +++++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 24 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 5dd61d32d2ede..8e95247a3d080 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -94,6 +94,7 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size *allocator = (struct ggml_backend_buffer){ /* .interface = */ ggml_allocator_simple_interface, /* .context = */ ctx, + /* .backend_size = */ 0, /* .backend_data = */ NULL, }; return allocator; @@ -192,6 +193,7 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); buffer->interface.free_data = ggml_backend_cpu_free_buffer; + buffer->backend_size = size; buffer->backend_data = data; return buffer; diff --git a/ggml-backend.h b/ggml-backend.h index f29b555919c88..37a6addb44315 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -27,6 +27,7 @@ extern "C" { struct ggml_backend_buffer { struct ggml_backend_buffer_interface interface; ggml_buffer_context_t context; + size_t backend_size; void * backend_data; }; diff --git a/ggml-metal.h b/ggml-metal.h index 89d616bb581a6..efde14544e477 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,17 @@ extern "C" { #endif -// GG: maybe return ptr and avoid the "ggml.h" include struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu); +// TODO: temporary - move to backend interface +bool ggml_backend_metal_map_buffer( + struct ggml_backend * backend, + const char * name, + void * data, + size_t size, + size_t max_size); + + //struct ggml_metal_context; // //// number of command 
buffers to use diff --git a/ggml-metal.m b/ggml-metal.m index 00a75777e56a1..8c0771b70bc71 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -242,12 +242,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { return nil; } +// TODO: rename to ggml_metal_map_buffer bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size) { + const char * name, + void * data, + size_t size, + size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); return false; @@ -993,38 +994,42 @@ void ggml_metal_graph_compute( } } -static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) { +bool ggml_backend_metal_map_buffer( + struct ggml_backend * backend, + const char * name, + void * data, + size_t size, + size_t max_size) { + return ggml_metal_add_buffer(backend->context, name, data, size, max_size); +} + +static const char * ggml_backend_metal_name(struct ggml_backend * ctx) { return "Metal"; UNUSED(ctx); } -static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx; - - ggml_metal_graph_compute(ctx_metal, cgraph); +static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { + ggml_metal_graph_compute(backend->context, cgraph); } static struct ggml_backend_interface metal_backend_interface = { /* .get_name = */ ggml_backend_metal_name, - /* .free_context = */ NULL, //ggml_backend_metal_free_context, - /* .alloc_buffer = */ NULL, //ggml_backend_metal_alloc_buffer, - /* .free_buffer = */ NULL, //ggml_backend_metal_free_buffer, - /* .reset_buffer = */ NULL, //ggml_backend_metal_reset_buffer, - /* .alloc_tensor = */ NULL, //ggml_backend_metal_alloc_tensor, - /* .set_tensor_async = */ NULL, //ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ NULL, //ggml_backend_metal_get_tensor_async, - /* .synchronize = */ NULL, //ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ NULL, //nullptr, - /* .cpy_tensor_to = */ NULL, //nullptr, - /* .graph_plan_create = */ NULL, //ggml_backend_metal_graph_plan_create, - /* .graph_plan_free = */ NULL, //ggml_backend_metal_graph_plan_free, - /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute, + /* .free = */ NULL, //ggml_backend_metal_alloc_buffer, + /* .alloc_buffer = */ NULL, //ggml_backend_metal_free_buffer, + /* .set_tensor_async = */ NULL, //ggml_backend_metal_reset_buffer, + /* .get_tensor_async = */ NULL, //ggml_backend_metal_alloc_tensor, + /* .synchronize = */ NULL, //ggml_backend_metal_set_tensor_async, + /* .cpy_tensor_from = */ NULL, //ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_to = */ NULL, //ggml_backend_metal_synchronize, + /* .graph_plan_create = */ NULL, //nullptr, + /* .graph_plan_free = */ NULL, //nullptr, + /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_create, /* .graph_compute = */ ggml_backend_metal_graph_compute, }; struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) { - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_metal_context * ctx = ggml_metal_init(8); struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend)); *backend_metal = (struct ggml_backend){ diff --git a/llama.cpp b/llama.cpp index e531d9a6447c6..3bbe738946580 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2817,6 
+2817,44 @@ struct llama_context * llama_new_context_with_model( } } +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx_metal); + data_size = ggml_get_mem_size (ctx->model.ctx_metal); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx_metal); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + fprintf(stderr, "%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "data", data_ptr, data_size, max_size)); + + struct ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer; + struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer; + + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0)); + + //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_METAL_CHECK_BUF + } +#endif + fprintf(stderr, "%s: layer backends: ", __func__); fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp)); From cb82adadb847062d9749fb63eb2fed6ce1a15e97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Jul 2023 14:56:29 +0300 Subject: [PATCH 7/8] metal : first working version of the inference without prompt processing Bonus: supports partial inference on the CPU --- ggml-metal.m | 12 ++++++------ llama.cpp | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8c0771b70bc71..0bc825277475f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -237,7 +237,7 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { } } - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + fprintf(stderr, "%s: error: buffer is nil for tensor '%s'\n", __func__, t->name); return nil; } @@ -877,15 +877,15 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; - const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *)(dst->op_params))[0]; float freq_base; float freq_scale; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; diff --git a/llama.cpp b/llama.cpp index 3bbe738946580..ffc4676f37093 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2845,10 +2845,15 @@ struct llama_context * llama_new_context_with_model( struct 
ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer; struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer; + struct ggml_backend_buffer * buf_input = ctx->buf_input->backend_buffer; + struct ggml_backend_buffer * buf_output = ctx->buf_output->backend_buffer; LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0)); LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_input->backend_data, buf_input->backend_size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_output->backend_data, buf_output->backend_size, 0)); + //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); #undef LLAMA_METAL_CHECK_BUF From d45c1631bc81bceef6106d319fb177ecad32daa0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Jul 2023 16:36:33 +0300 Subject: [PATCH 8/8] metal : rewrite to fit new backend interface correctly (WIP) --- ggml-backend.c | 5 +- ggml-backend.h | 1 - ggml-metal.h | 16 +-- ggml-metal.m | 372 +++++++++++++++++++++++-------------------------- ggml.c | 12 ++ llama.cpp | 77 +++------- 6 files changed, 209 insertions(+), 274 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 8e95247a3d080..76f5a35719c19 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -94,7 +94,6 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size *allocator = (struct ggml_backend_buffer){ /* .interface = */ ggml_allocator_simple_interface, /* .context = */ ctx, - /* .backend_size = */ 0, /* .backend_data = */ NULL, }; return allocator; @@ -146,6 +145,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst return; } + //printf("src->data = %p, src->extra = %p\n", src->data, src->extra); + //printf("dst->data = %p, dst->extra = %p\n", dst->data, dst->extra); + if (dst->backend->interface.cpy_tensor_from != NULL) { dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst); } else if (src->backend->interface.cpy_tensor_to != NULL) { @@ -193,7 +195,6 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); buffer->interface.free_data = ggml_backend_cpu_free_buffer; - buffer->backend_size = size; buffer->backend_data = data; return buffer; diff --git a/ggml-backend.h b/ggml-backend.h index 37a6addb44315..f29b555919c88 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -27,7 +27,6 @@ extern "C" { struct ggml_backend_buffer { struct ggml_backend_buffer_interface interface; ggml_buffer_context_t context; - size_t backend_size; void * backend_data; }; diff --git a/ggml-metal.h b/ggml-metal.h index efde14544e477..e6dd8b900d46f 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -19,14 +19,9 @@ #pragma once -#include "ggml.h" - #include #include -// max memory buffers that can be mapped to the device -#define GGML_METAL_MAX_BUFFERS 16 - //struct ggml_tensor; //struct ggml_cgraph; @@ -34,16 +29,9 @@ extern "C" { #endif -struct ggml_backend * 
ggml_backend_metal_init(struct ggml_backend * backend_cpu); - -// TODO: temporary - move to backend interface -bool ggml_backend_metal_map_buffer( - struct ggml_backend * backend, - const char * name, - void * data, - size_t size, - size_t max_size); +struct ggml_backend; +struct ggml_backend * ggml_backend_metal_init(void); //struct ggml_metal_context; // diff --git a/ggml-metal.m b/ggml-metal.m index 0bc825277475f..573a9e7674ec6 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -12,18 +12,16 @@ #else #define metal_printf(...) fprintf(stderr, __VA_ARGS__) #endif +//#define metal_printf(...) fprintf(stderr, __VA_ARGS__) #define UNUSED(x) (void)(x) -struct ggml_metal_buffer { - const char * name; - - void * data; - size_t size; - - id metal; +struct ggml_metal_buffer_wrapper { + id buffer; }; +static void * g_ptr_base = (void *)0x1000; + struct ggml_metal_context { int n_cb; @@ -33,9 +31,6 @@ id queue; id library; - int n_buffers; - struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - // custom kernels #define GGML_METAL_DECL_KERNEL(name) \ id function_##name; \ @@ -96,7 +91,6 @@ @implementation GGMLMetalClass ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; - ctx->n_buffers = 0; // determine if we can use MPS if (MPSSupportsMTLDevice(ctx->device)) { @@ -205,9 +199,6 @@ @implementation GGMLMetalClass void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); - for (int i = 0; i < ctx->n_buffers; ++i) { - [ctx->buffers[i].metal release]; - } free(ctx); } @@ -215,143 +206,29 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { ctx->n_cb = n_cb; } -// finds the Metal buffer that contains the tensor data on the GPU device -// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the -// Metal buffer based on the host memory pointer -// -static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); - - const int64_t tsize = ggml_nbytes(t); - - // find the view that contains the tensor fully - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; - - if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { - *offs = (size_t) ioffs; - - //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); - - return ctx->buffers[i].metal; - } +static id ggml_metal_get_buffer(struct ggml_tensor * tensor, size_t * offs) { + if (tensor == nil) { + return nil; } - fprintf(stderr, "%s: error: buffer is nil for tensor '%s'\n", __func__, t->name); - - return nil; -} - -// TODO: rename to ggml_metal_map_buffer -bool ggml_metal_add_buffer( - struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size) { - if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - fprintf(stderr, "%s: too many buffers\n", __func__); - return false; - } - - if (data) { - // verify that the buffer does not overlap with any of the existing buffers - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; - - if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, 
ctx->buffers[i].name); - return false; - } - } - - const size_t size_page = getpagesize(); - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= ctx->device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); - return false; - } - - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = ctx->device.maxBufferLength - size_ovlp; - const size_t size_view = ctx->device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); - return false; - } - - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); - if (i + size_step < size) { - fprintf(stderr, "\n"); + switch (tensor->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + if (tensor->op == GGML_OP_VIEW) { + //printf("view offs = %zu\n", *(size_t *)tensor->op_params); } - - ++ctx->n_buffers; + return ggml_metal_get_buffer(tensor->src[0], offs); } - } - fprintf(stderr, ", (%8.2f / %8.2f)", - ctx->device.currentAllocatedSize / 1024.0 / 1024.0, - ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - - if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); - } else { - fprintf(stderr, "\n"); - } + default: {} } - return true; -} - -void ggml_metal_set_tensor( - struct ggml_metal_context * ctx, - struct ggml_tensor * t) { - metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); - - size_t offs; - id id_dst = ggml_metal_get_buffer(ctx, t, &offs); - - memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t)); -} - -void ggml_metal_get_tensor( - struct ggml_metal_context * ctx, - struct ggml_tensor * t) { - metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - - size_t offs; - id id_src = 
ggml_metal_get_buffer(ctx, t, &offs); - - memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t)); + *offs = (size_t) tensor->data - (size_t) g_ptr_base; + //printf("%s: offs = %zu, %p, op = %s\n", __func__, *offs, tensor->extra, ggml_op_name(tensor->op)); + return ((struct ggml_metal_buffer_wrapper *) tensor->extra)->buffer; } void ggml_metal_graph_compute( @@ -432,23 +309,35 @@ void ggml_metal_graph_compute( const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - - //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + continue; + } break; + default: break; + } + + id id_src0 = ggml_metal_get_buffer(src0, &offs_src0); + id id_src1 = ggml_metal_get_buffer(src1, &offs_src1); + id id_dst = ggml_metal_get_buffer(dst, &offs_dst); + + metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + if (src0) { + metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + ggml_is_contiguous(src0), src0->name); + } + if (src1) { + metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + ggml_is_contiguous(src1), src1->name); + } + if (dst) { + metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + dst->name); + } switch (dst->op) { case GGML_OP_NONE: @@ -501,7 +390,9 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const float scale = *(const float *) src1->data; + //const float scale = *(const float *) src1->data; + const float scale = ((float *)((char *)[((struct ggml_metal_buffer_wrapper *)(src1->extra))->buffer contents] + (size_t) src1->data - (size_t)g_ptr_base))[0]; + //printf("scale: %f, src1->data: %p, src1->extra: %p, src1->extra->buffer: %p\n", scale, src1->data, src1->extra, ((struct ggml_metal_buffer_wrapper *)(src1->extra))->buffer); [encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -578,7 +469,8 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_past = ((int32_t *)(src1->data))[0]; + //const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *)(dst->op_params))[0]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -740,6 +632,10 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + //printf("id_src0 %p, 
offs_src0 %zu\n", id_src0, offs_src0); + //printf("id_src1 %p, offs_src1 %zu\n", id_src1, offs_src1); + //printf("id_dst %p, offs_dst %zu\n", id_dst, offs_dst); + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -877,11 +773,10 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_past = ((int32_t *)(dst->op_params))[0]; - float freq_base; float freq_scale; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); @@ -994,61 +889,140 @@ void ggml_metal_graph_compute( } } -bool ggml_backend_metal_map_buffer( - struct ggml_backend * backend, - const char * name, - void * data, - size_t size, - size_t max_size) { - return ggml_metal_add_buffer(backend->context, name, data, size, max_size); -} - static const char * ggml_backend_metal_name(struct ggml_backend * ctx) { return "Metal"; UNUSED(ctx); } +static void ggml_backend_metal_free(struct ggml_backend * backend) { + struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx_metal); + free(backend); +} + +static const size_t TENSOR_ALIGNMENT = 128; + +static void ggml_backend_metal_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { + tensor->extra = alloc->backend_data; +} + +static void ggml_backend_metal_free_data(struct ggml_backend_buffer * alloc) { + struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *)alloc->backend_data; + [wrapper->buffer release]; + free(wrapper); +} + +static struct ggml_backend_buffer * ggml_backend_metal_alloc_buffer(struct ggml_backend * backend, size_t size) { + struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *)backend->context; + + struct ggml_metal_buffer_wrapper * wrapper = malloc(sizeof(struct ggml_metal_buffer_wrapper)); + wrapper->buffer = [ctx_metal->device newBufferWithLength:size options:MTLResourceStorageModeShared]; + if (wrapper->buffer == nil) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + GGML_ASSERT(false); + } + + //printf("XXXXXXXXXXXXXXX ALOC: %p %p %p size = %zu\n", (void * )wrapper, (void *)&wrapper->buffer, (void *)[wrapper->buffer contents], size); + + struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(g_ptr_base, size, TENSOR_ALIGNMENT); + buffer->interface.init_tensor = ggml_backend_metal_init_tensor; + buffer->interface.free_data = ggml_backend_metal_free_data; + buffer->backend_data = wrapper; + + return buffer; +} + +static void ggml_backend_metal_set_tensor_async(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->extra != nil && "tensor not allocated"); + + struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *)tensor->extra; + char * contents = (char *)[wrapper->buffer contents]; + + const size_t t_data = (size_t) tensor->data - (size_t) g_ptr_base; + + //printf("XXXXXXXXXXXXXXX SET : %p %p %p offset = %zu\n", (void *)(tensor->data), (void *)&wrapper->buffer, (void *)contents, offset); + + memcpy((char *)contents + t_data + offset, data, size); + + //memcpy((char *)tensor->data, data, 
size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + //printf("XXXXXXXXXXXXXXX GET : %d %p, backend = %s\n", (void *)(tensor->data), (void *)tensor->extra, tensor->backend->interface.get_name(tensor->backend)); + GGML_ASSERT(tensor->extra != nil && "tensor not allocated"); + + struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *)tensor->extra; + const char * contents = (const char *)[wrapper->buffer contents]; + + const size_t t_data = (size_t) tensor->data - (size_t) g_ptr_base; + + //printf("XXXXXXXXXXXXXXX GET : %p %p %p offset = %zu\n", (void *)(tensor->data), (void *)&wrapper->buffer, (void *)contents, offset); + + memcpy(data, (const char *)contents + t_data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(struct ggml_backend * backend) { + UNUSED(backend); +} + +static ggml_graph_plan_t ggml_backend_metal_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { + GGML_ASSERT(false); + + return nil; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_metal_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { + GGML_ASSERT(false); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_metal_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { + GGML_ASSERT(false); + + UNUSED(backend); + UNUSED(plan); +} + static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { ggml_metal_graph_compute(backend->context, cgraph); } static struct ggml_backend_interface metal_backend_interface = { /* .get_name = */ ggml_backend_metal_name, - /* .free = */ NULL, //ggml_backend_metal_alloc_buffer, - /* .alloc_buffer = */ NULL, //ggml_backend_metal_free_buffer, - /* .set_tensor_async = */ NULL, //ggml_backend_metal_reset_buffer, - /* .get_tensor_async = */ NULL, //ggml_backend_metal_alloc_tensor, - /* .synchronize = */ NULL, //ggml_backend_metal_set_tensor_async, - /* .cpy_tensor_from = */ NULL, //ggml_backend_metal_get_tensor_async, - /* .cpy_tensor_to = */ NULL, //ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, //nullptr, - /* .graph_plan_free = */ NULL, //nullptr, - /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_create, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ nil, //ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_to = */ nil, //ggml_backend_metal_synchronize, + /* .graph_plan_create = */ ggml_backend_metal_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_metal_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_metal_graph_plan_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute, }; -struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) { - struct ggml_metal_context * ctx = ggml_metal_init(8); +struct ggml_backend * ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = ggml_metal_init(1); struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend)); *backend_metal = (struct 
ggml_backend){ /* .interface = */ metal_backend_interface, /* .context = */ ctx, - /* .is_ram_shared = */ true, + /* .is_ram_shared = */ false, }; - // reuses CPU calls for now - backend_metal->interface.free = backend_cpu->interface.free; - backend_metal->interface.alloc_buffer = backend_cpu->interface.alloc_buffer; - backend_metal->interface.set_tensor_async = backend_cpu->interface.set_tensor_async; - backend_metal->interface.get_tensor_async = backend_cpu->interface.get_tensor_async; - backend_metal->interface.synchronize = backend_cpu->interface.synchronize; - backend_metal->interface.cpy_tensor_from = backend_cpu->interface.cpy_tensor_from; - backend_metal->interface.cpy_tensor_to = backend_cpu->interface.cpy_tensor_to; - backend_metal->interface.graph_plan_create = backend_cpu->interface.graph_plan_create; - backend_metal->interface.graph_plan_free = backend_cpu->interface.graph_plan_free; - backend_metal->interface.graph_plan_compute = backend_cpu->interface.graph_plan_compute; - return backend_metal; } diff --git a/ggml.c b/ggml.c index 19db8241fa4e8..1308fe2448b30 100644 --- a/ggml.c +++ b/ggml.c @@ -4927,6 +4927,7 @@ struct ggml_tensor * ggml_view_tensor( result->nb[1] = src->nb[1]; result->nb[2] = src->nb[2]; result->nb[3] = src->nb[3]; + result->extra = src->extra; return result; } @@ -6262,6 +6263,7 @@ struct ggml_tensor * ggml_reshape( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6287,6 +6289,7 @@ struct ggml_tensor * ggml_reshape_1d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6313,6 +6316,7 @@ struct ggml_tensor * ggml_reshape_2d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6340,6 +6344,7 @@ struct ggml_tensor * ggml_reshape_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6369,6 +6374,7 @@ struct ggml_tensor * ggml_reshape_4d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6396,6 +6402,7 @@ struct ggml_tensor * ggml_view_1d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6431,6 +6438,7 @@ struct ggml_tensor * ggml_view_2d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6468,6 +6476,7 @@ struct ggml_tensor * ggml_view_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6507,6 +6516,7 @@ struct ggml_tensor * ggml_view_4d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = NULL; + result->extra = a->extra; return result; } @@ -6568,6 +6578,7 @@ struct ggml_tensor * ggml_permute( result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL;
+    result->extra = a->extra;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
     ggml_set_op_params(result, &params, sizeof(params));
 
@@ -6599,6 +6610,7 @@ struct ggml_tensor * ggml_transpose(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL;
+    result->extra = a->extra;
 
     return result;
 }
diff --git a/llama.cpp b/llama.cpp
index ffc4676f37093..a215ab3005795 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -226,7 +226,7 @@ struct llama_model {
 
     // backends
     ggml_backend * backend_cpu = NULL;
-    ggml_buffer * buf_cpu = NULL;
+    ggml_buffer * buf_cpu = NULL;
     ggml_context * ctx_cpu = NULL;
 #ifdef GGML_USE_CUDA
     ggml_backend * backend_cuda = NULL;
@@ -234,8 +234,8 @@ struct llama_model {
     ggml_context * ctx_cuda = NULL;
 #endif
 #ifdef GGML_USE_METAL
-    ggml_backend * backend_metal;
-    ggml_buffer * buf_metal;
+    ggml_backend * backend_metal = NULL;
+    ggml_buffer * buf_metal = NULL;
     ggml_context * ctx_metal = NULL;
 #endif
 
@@ -991,7 +991,7 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init(backend_cpu);
+        model.backend_metal = ggml_backend_metal_init();
         backend_gpu = model.backend_metal;
     }
 #endif
@@ -1081,15 +1081,13 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        // the metal context is actually a CPU context because we have unified memory
         const size_t ctx_size = ctx_sizes[model.backend_metal];
         const size_t n_tensors = ml->tensors_map.tensors.size();
 
         model.buf_metal = ggml_buffer_alloc(model.backend_metal, ctx_size, n_tensors);
 
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = model.buf_metal;
-        params.no_alloc = ml->use_mmap;
+        params.buffer = model.buf_metal;
 
         model.ctx_metal = ggml_init(params);
         if (!model.ctx_metal) {
@@ -1372,10 +1370,10 @@ static ggml_graph_splits llama_build_graph(
             struct ggml_tensor * tmpv = ggml_mul_mat(ctx_l, model.layers[il].wv, cur);
             ggml_set_name(tmpv, "tmpv");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx_l, ggml_reshape_3d(ctx_l, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx_l, ggml_reshape_3d(ctx_l, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx_l, ggml_reshape_3d(ctx_l, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx_l, ggml_reshape_3d(ctx_l, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             ggml_set_name(Qcur, "Qcur");
 
             struct ggml_tensor * Vcur = ggml_transpose(ctx_l, ggml_reshape_2d(ctx_l, tmpv, n_embd, N));
@@ -1428,15 +1426,15 @@ static ggml_graph_splits llama_build_graph(
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx_kv, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx_kv, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx_kv, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx_kv, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx_kv, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = 
ggml_soft_max(ctx_kv, KQ_masked); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -2717,6 +2715,12 @@ struct llama_context * llama_new_context_with_model( } else { ctx->backend_kv = model->backend_cpu; } +#elif GGML_USE_METAL + if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2 && !params.low_vram) { + ctx->backend_kv = model->backend_metal; + } else { + ctx->backend_kv = model->backend_cpu; + } #else ctx->backend_kv = model->backend_cpu; #endif @@ -2817,49 +2821,6 @@ struct llama_context * llama_new_context_with_model( } } -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - void * data_ptr = NULL; - size_t data_size = 0; - - if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx_metal); - data_size = ggml_get_mem_size (ctx->model.ctx_metal); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx_metal); - - printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - fprintf(stderr, "%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "data", data_ptr, data_size, max_size)); - - struct ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer; - struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer; - struct ggml_backend_buffer * buf_input = ctx->buf_input->backend_buffer; - struct ggml_backend_buffer * buf_output = ctx->buf_output->backend_buffer; - - LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0)); - - LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_input->backend_data, buf_input->backend_size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_output->backend_data, buf_output->backend_size, 0)); - - //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); - //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif - fprintf(stderr, "%s: layer backends: ", __func__); fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp)); @@ -3150,14 +3111,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); ggml_set_name(scale_tensor, "scale_tensor"); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + BA = ggml_scale(lora_ctx, BA, scale_tensor); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); - ggml_set_name(r, "r_add_inplace"); + r = ggml_add(lora_ctx, dest_t, BA); + ggml_set_name(r, "r_add"); } else { r = ggml_add(lora_ctx, base_t, BA);