Add llama compatibility with new ggml quantization #642
.gitmodules

```diff
@@ -1,3 +1,9 @@
-[submodule "llama.cpp"]
-    path = gpt4all-backend/llama.cpp
+[submodule "llama.cpp-230519"]
+    path = gpt4all-backend/llama.cpp-230519
+    url = https://github.com/ggerganov/llama.cpp.git
+[submodule "llama.cpp-230511"]
+    path = gpt4all-backend/llama.cpp-230511
     url = https://github.com/manyoso/llama.cpp.git
+[submodule "llama.cpp-mainline"]
+    path = gpt4all-backend/llama.cpp-mainline
+    url = https://github.com/ggerganov/llama.cpp.git
```

gpt4all-backend/CMakeLists.txt

```diff
@@ -54,7 +54,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})

     # Include GGML
-    include_ggml(llama.cpp -${BUILD_VARIANT} ON)
+    include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
+    include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
+    include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)

     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)

@@ -71,18 +73,32 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
         PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED})
     endfunction()

-    # Add each individual implementation
-    add_library(llamamodel-${BUILD_VARIANT} SHARED
+    # Add each individual implementations
```
Collaborator
Nitpick: you don't want the plural here.

Contributor (Author)
I noticed that as well, but decided to leave it as is since it's not worth a commit. Will batch this with further things that may come up.

```diff
+    add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
         llamamodel.cpp)
-    prepare_target(llamamodel llama)
+    target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+        LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
```
Collaborator
=>= oh man cmake.. you're killing me

Contributor (Author)
Haha, yup. Looks confusing, is confusing, but does what we need quite flexibly.

Contributor
That conditional should probably be changed to a slightly less cursed variant:

```cpp
#if LLAMA_VERSION <= 123456
// ...
#elif LLAMA_VERSION >= 654321
// ...
#endif
```

At least then it would be a readily recognizable pattern of tragic stylistic compromise instead of a confusing, entirely new way to crush one's hopes and dreams. Would also shrink the cmake side a little. Pardon the gallows humour, can't help it whenever pre-processor macros seem necessary. ;)
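
For anyone puzzling over the trick being discussed: the CMake definition injects a comparison operator together with its operand, and the source then pastes that token sequence after a value (see the new magic_match further down in the diff). A minimal sketch of the expansion, where version_ok is an invented stand-in for the real call site:

```cpp
#include <cstdint>

// Stand-in for what the build passes via target_compile_definitions, e.g.
// LLAMA_VERSIONS=>=3 (mainline), LLAMA_VERSIONS===2 (230519), LLAMA_VERSIONS=<=1 (230511).
#ifndef LLAMA_VERSIONS
#define LLAMA_VERSIONS >= 3
#endif

// version_ok is hypothetical; the real code inlines this check in magic_match().
static bool version_ok(uint32_t version) {
    return version LLAMA_VERSIONS; // the preprocessor turns this into: return version >= 3;
}
```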

```diff
+    prepare_target(llamamodel-mainline llama-mainline)
+
+    add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
+        llamamodel.cpp)
+    target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
+        LLAMA_VERSIONS===2 LLAMA_DATE=230519)
+    prepare_target(llamamodel-230519 llama-230519)
+
+    add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
+        llamamodel.cpp)
+    target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
+        LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
+    prepare_target(llamamodel-230511 llama-230511)

     add_library(gptj-${BUILD_VARIANT} SHARED
         gptj.cpp)
-    prepare_target(gptj ggml)
+    prepare_target(gptj ggml-230511)
```
Collaborator
Wait, where are you tagging the actual ggml with this?

Contributor (Author)
llama.cpp.cmake adds the given suffix to ggml as well.
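
Presumably that looks something like the following inside include_ggml (a sketch under that assumption, not the actual llama.cpp.cmake contents; only the function signature and the llama${SUFFIX} target are visible in the excerpt further down):

```cmake
# Sketch only: assumes include_ggml() builds one ggml target per call and
# appends the caller-supplied suffix, mirroring the llama${SUFFIX} target
# shown later in llama.cpp.cmake.
function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
    add_library(ggml${SUFFIX} OBJECT
        ${DIRECTORY}/ggml.c
        ${DIRECTORY}/ggml.h)

    if (WITH_LLAMA)
        add_library(llama${SUFFIX}
            ${DIRECTORY}/llama.cpp
            ${DIRECTORY}/llama.h)
        target_link_libraries(llama${SUFFIX} PRIVATE ggml${SUFFIX})
    endif()
endfunction()

# So include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON) would yield
# targets named ggml-230511-<variant> and llama-230511-<variant>.
```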

```diff
     add_library(mpt-${BUILD_VARIANT} SHARED
         mpt.cpp)
-    prepare_target(mpt ggml)
+    prepare_target(mpt ggml-230511)
 endforeach()

 add_library(llmodel
```
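
prepare_target only appears in fragments above; presumably it glues each implementation library to its matching base llama/ggml target for the current build variant, roughly along these lines (a sketch under that assumption, not the actual file contents; the GGML_BUILD_VARIANT definition is also an assumption, inferred from its use in llamamodel.cpp below):

```cmake
# Sketch: assumes prepare_target() appends the build variant to both names,
# links the implementation against its base library, and enables IPO.
function(prepare_target TARGET_NAME BASE_LIB)
    set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT})
    target_compile_definitions(${TARGET_NAME} PRIVATE GGML_BUILD_VARIANT="${BUILD_VARIANT}")
    target_link_libraries(${TARGET_NAME} PRIVATE ${BASE_LIB}-${BUILD_VARIANT})
    set_property(TARGET ${TARGET_NAME}
        PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED})
endfunction()

# e.g. prepare_target(llamamodel-230511 llama-230511) would link
# llamamodel-230511-<variant> against llama-230511-<variant>.
```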

gpt4all-backend/llama.cpp.cmake

```diff
@@ -332,10 +332,16 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     endif()

     if (WITH_LLAMA)
+        # Backwards compatibility with old llama.cpp versions
+        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+        if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+            set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
+        endif()
+
         add_library(llama${SUFFIX}
             ${DIRECTORY}/llama.cpp
             ${DIRECTORY}/llama.h
-            ${DIRECTORY}/llama_util.h)
+            ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
```
Collaborator
This branch doesn't actually introduce this file, right? It exists upstream in one of the pinned submodules?

Contributor (Author)
The filename was changed.

```diff
         target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
         target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
```

gpt4all-backend/llamamodel.cpp

```diff
@@ -28,14 +28,23 @@
 #include <llama.h>
 #include <ggml.h>

 namespace {
 const char *modelType_ = "LLaMA";
 }

 struct gpt_params {
     int32_t seed = -1; // RNG seed
-    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
+#if LLAMA_DATE <= 230511
+    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
+#endif
```
Collaborator
The crux of it. We're going to use macros...

Contributor (Author)
Our other option would be to have an extensive collection of almost-identical source files.

Collaborator
No, I think this is the right choice of a bunch of bad choices.

Contributor
There's also CRTP and C++ template magic, but I agree it's not the time to go there yet.
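
For reference, the CRTP route alluded to there would look roughly like this (a hypothetical sketch, not code from this PR; the class and method names are invented): the shared logic lives in a templated base and each pinned llama.cpp snapshot gets a thin derived class compiled into its own target, trading preprocessor noise for a little template machinery.

```cpp
#include <string>

// Hypothetical CRTP alternative to the LLAMA_DATE/LLAMA_VERSIONS macros.
template <typename Derived>
class LLamaModelBase {
public:
    bool loadModel(const std::string &modelPath) {
        // Shared, version-independent setup would live here...
        static_cast<Derived *>(this)->applyVersionSpecificParams();
        // ...followed by the common llama_init_from_file() call.
        return true;
    }
};

// One thin subclass per pinned llama.cpp snapshot, each built in its own target.
class LLamaModel230511 : public LLamaModelBase<LLamaModel230511> {
public:
    void applyVersionSpecificParams() {
        // e.g. set n_parts, which only the 230511 API still has
    }
};

class LLamaModelMainline : public LLamaModelBase<LLamaModelMainline> {
public:
    void applyVersionSpecificParams() {
        // e.g. set the tfs_z / typical_p sampling defaults added upstream
    }
};
```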

```diff
+#if LLAMA_DATE >= 230519
+    // sampling parameters
+    float tfs_z = 1.0f; // 1.0 = disabled
+    float typical_p = 1.0f; // 1.0 = disabled
+#endif

     std::string prompt = "";

@@ -45,25 +54,45 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
 };

+#if LLAMA_DATE >= 230519
+int llama_sample_top_p_top_k(
+        llama_context *ctx,
+        const llama_token *last_n_tokens_data,
+        int last_n_tokens_size,
+        int top_k,
+        float top_p,
+        float temp,
+        float repeat_penalty) {
+    auto logits = llama_get_logits(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    // Populate initial list of all candidates
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (int token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
+    // Sample repeat penalty
+    llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);
+    // Temperature sampling
+    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+    llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+    llama_sample_temperature(ctx, &candidates_p, temp);
+    return llama_sample_token(ctx, &candidates_p);
+}
```
Collaborator
Going to assume this is giving you sane results? Have you made sure to go through and test models with each of the pinned variants and file formats? Man, we almost want regression or unit tests here...

Contributor (Author)
Yup! I did. Man, was my hard drive full..

Contributor (Author)
This is also how it's done in the llama.cpp main example.

```diff
+#endif
```
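
To illustrate what the shim buys: call sites can keep using the pre-refactor one-call sampling API regardless of which llama.cpp snapshot the translation unit is built against. A hypothetical call site (the wrapper name, token window, and sampling settings are made up):

```cpp
#include <vector>
#include <llama.h>

// sample_next is illustrative only; it just shows the old-style call compiling
// against both the 230519 shim above and the older built-in API.
static llama_token sample_next(llama_context *ctx,
                               const std::vector<llama_token> &last_n_tokens) {
    return llama_sample_top_p_top_k(ctx,
                                    last_n_tokens.data(),
                                    static_cast<int>(last_n_tokens.size()),
                                    /*top_k=*/40,
                                    /*top_p=*/0.95f,
                                    /*temp=*/0.8f,
                                    /*repeat_penalty=*/1.1f);
}
```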

```diff
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
     llama_context *ctx = nullptr;
     llama_context_params params;
     int64_t n_threads = 0;
+    bool empty = true;
 };

-static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
 LLamaModel::LLamaModel()
     : d_ptr(new LLamaPrivate) {
     modelType = modelType_;

@@ -78,11 +107,13 @@ bool LLamaModel::loadModel(const std::string &modelPath)
     gpt_params params;
     d_ptr->params.n_ctx = 2048;
-    d_ptr->params.n_parts = params.n_parts;
     d_ptr->params.seed = params.seed;
     d_ptr->params.f16_kv = params.memory_f16;
     d_ptr->params.use_mmap = params.use_mmap;
     d_ptr->params.use_mlock = params.use_mlock;
+#if LLAMA_DATE <= 230511
+    d_ptr->params.n_parts = params.n_parts;
+#endif

     d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
     if (!d_ptr->ctx) {

@@ -126,7 +157,8 @@ size_t LLamaModel::saveState(uint8_t *dest) const
 size_t LLamaModel::restoreState(const uint8_t *src)
 {
-    return llama_set_state_data(d_ptr->ctx, src);
+    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
+    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }

 void LLamaModel::prompt(const std::string &prompt,

@@ -147,7 +179,11 @@ void LLamaModel::prompt(const std::string &prompt,
     params.prompt.insert(0, 1, ' ');

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false);
+    std::vector<llama_token> embd_inp(params.prompt.size() + 4);
+    int n = llama_tokenize(d_ptr->ctx, params.prompt.c_str(), embd_inp.data(), embd_inp.size(), d_ptr->empty);
+    assert(n >= 0);
+    embd_inp.resize(n);
+    d_ptr->empty = false;

     // save the context size
     promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx);

@@ -313,8 +349,15 @@ const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-bool magic_match(uint32_t magic) {
-    return magic == 0x67676a74;
+bool magic_match(std::istream& f) {
+    // Check magic
+    uint32_t magic = 0;
+    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
+    if (magic != 0x67676a74) return false;
+    // Check version
+    uint32_t version = 0;
+    f.read(reinterpret_cast<char*>(&version), sizeof(version));
+    return version LLAMA_VERSIONS;
 }

 LLModel *construct() {
```
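
The new magic_match signature implies the caller now hands each candidate implementation the open model file instead of a pre-read magic value, so every build can also inspect the format version and decline incompatible ggml revisions. A hypothetical sketch of that probing loop (the real loader is not part of this diff; the Implementation struct, function-pointer names, and pick_implementation are assumptions):

```cpp
#include <fstream>
#include <istream>
#include <string>
#include <vector>

// Hypothetical shape of the loader side; not code from this PR.
struct Implementation {
    bool (*magic_match)(std::istream &f);   // exported by each llamamodel-*/gptj/mpt library
    const char *(*get_build_variant)();
};

static const Implementation *pick_implementation(const std::vector<Implementation> &impls,
                                                 const std::string &modelPath) {
    for (const auto &impl : impls) {
        std::ifstream f(modelPath, std::ios::binary);
        if (!f) return nullptr;
        // Each implementation reads the magic (and now the format version) itself.
        if (impl.magic_match(f))
            return &impl;
    }
    return nullptr;
}
```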
Ok, ok, I get ya, but this isn't actually pinning them. Also, I think I still want all of them to use the 'manyoso' fork as this gives us further control, right?

Not sure what you mean, the manyoso fork hasn't been updated to latest llama.cpp, it's 132 commits behind...

Also, that fork only adds alibi, which is only needed for MPT.

I mean we should update that fork, and point to it I believe. Lemme do that now.
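
If the submodule URLs do get repointed at the fork as discussed, the resulting .gitmodules would presumably read as follows (a sketch of that outcome, not part of this PR; the actual pinning to specific commits lives in the git index rather than in this file):

```ini
[submodule "llama.cpp-mainline"]
    path = gpt4all-backend/llama.cpp-mainline
    url = https://github.com/manyoso/llama.cpp.git
[submodule "llama.cpp-230511"]
    path = gpt4all-backend/llama.cpp-230511
    url = https://github.com/manyoso/llama.cpp.git
[submodule "llama.cpp-230519"]
    path = gpt4all-backend/llama.cpp-230519
    url = https://github.com/manyoso/llama.cpp.git
```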