feat: bump llama.cpp to b3889
hans00 committed Oct 7, 2024
1 parent 8967d9b commit 165e5c0
Showing 6 changed files with 16 additions and 48 deletions.
9 changes: 0 additions & 9 deletions CMakeLists.txt
@@ -62,15 +62,6 @@ if (VULKAN_SDK)
find_package(Vulkan REQUIRED)
endif()

-find_program(PATCH patch REQUIRED)
-
-add_custom_target(
-  patch ALL
-  COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
-  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
-  COMMENT "Applying patches"
-)
-
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
add_subdirectory("src/llama.cpp")

22 changes: 0 additions & 22 deletions patches/llama.patch

This file was deleted.

12 changes: 6 additions & 6 deletions src/LlamaCompletionWorker.cpp
@@ -59,13 +59,13 @@ void LlamaCompletionWorker::Execute() {
size_t n_cur = 0;
size_t n_input = 0;
const auto model = _sess->model();
-const bool add_bos = llama_should_add_bos_token(model);
+const bool add_bos = llama_add_bos_token(model);
auto ctx = _sess->context();

-llama_set_rng_seed(ctx, _params.seed);
+auto sparams = llama_sampler_chain_default_params();

-LlamaCppSampling sampling{llama_sampling_init(_params.sparams),
-llama_sampling_free};
+LlamaCppSampling sampling{gpt_sampler_init(model, _params.sparams),
+gpt_sampler_free};

std::vector<llama_token> prompt_tokens =
::llama_tokenize(ctx, _params.prompt, add_bos);
@@ -109,8 +109,8 @@ void LlamaCompletionWorker::Execute() {
}
// sample the next token
const llama_token new_token_id =
-llama_sampling_sample(sampling.get(), ctx, nullptr);
-llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
+gpt_sampler_sample(sampling.get(), ctx, -1);
+gpt_sampler_accept(sampling.get(), new_token_id, true);
// prepare the next batch
embd->emplace_back(new_token_id);
auto token = llama_token_to_piece(ctx, new_token_id);
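For context, the sketch below shows how a decode loop built on the b3889-era common sampling API fits together, using the same calls this diff switches to (gpt_sampler_init, gpt_sampler_sample, gpt_sampler_accept, gpt_sampler_free). It is a minimal illustration, not the worker's actual code: the helper name run_completion, the max_new_tokens parameter, and the use of llama_batch_get_one / llama_token_is_eog are assumptions about the surrounding llama.cpp API at this revision.

// Minimal sketch (not the project's code): a gpt_sampler-based decode loop
// against llama.cpp b3889-era common headers. Error handling is omitted and
// the batch helpers are assumed to match this revision's signatures.
#include "common/common.h"
#include "common/sampling.h"
#include "llama.h"
#include <vector>

static void run_completion(llama_model *model, llama_context *ctx,
                           gpt_params &params, int max_new_tokens) {
  // Build the sampler from the model and the sampling params, as in the diff.
  gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);

  const bool add_bos = llama_add_bos_token(model);
  std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

  // Evaluate the prompt in one batch.
  llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0));
  int n_past = (int) tokens.size();

  for (int i = 0; i < max_new_tokens; ++i) {
    // Sample from the last set of logits (idx = -1) and record the choice.
    llama_token id = gpt_sampler_sample(smpl, ctx, -1);
    gpt_sampler_accept(smpl, id, /* accept_grammar = */ true);
    if (llama_token_is_eog(model, id)) {
      break; // end-of-generation token
    }
    // Feed the sampled token back for the next step.
    llama_decode(ctx, llama_batch_get_one(&id, 1, n_past++, 0));
  }

  gpt_sampler_free(smpl);
}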
16 changes: 7 additions & 9 deletions src/LlamaContext.cpp
@@ -75,7 +75,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
params.embedding = get_option<bool>(options, "embedding", false);
params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
-params.n_threads =
+params.cpuparams.n_threads =
get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
params.use_mlock = get_option<bool>(options, "use_mlock", false);
@@ -86,16 +86,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
llama_backend_init();
llama_numa_init(params.numa);

-llama_model *model;
-llama_context *ctx;
-std::tie(model, ctx) = llama_init_from_gpt_params(params);
+auto result = llama_init_from_gpt_params(params);

-if (model == nullptr || ctx == nullptr) {
+if (result.model == nullptr || result.context == nullptr) {
Napi::TypeError::New(env, "Failed to load model")
.ThrowAsJavaScriptException();
}

-_sess = std::make_shared<LlamaSession>(model, ctx, params);
+_sess = std::make_shared<LlamaSession>(result.model, result.context, params);
_info = gpt_params_get_system_info(params);
}

@@ -167,11 +165,11 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
params.sparams.penalty_present =
get_option<float>(options, "penalty_present", 0.00f);
params.sparams.penalize_nl = get_option<bool>(options, "penalize_nl", false);
-params.sparams.typical_p = get_option<float>(options, "typical_p", 1.00f);
-params.ignore_eos = get_option<float>(options, "ignore_eos", false);
+params.sparams.typ_p = get_option<float>(options, "typical_p", 1.00f);
+params.sparams.ignore_eos = get_option<float>(options, "ignore_eos", false);
params.sparams.grammar = get_option<std::string>(options, "grammar", "");
params.n_keep = get_option<int32_t>(options, "n_keep", 0);
-params.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
+params.sparams.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
std::vector<std::string> stop_words;
if (options.Has("stop") && options.Get("stop").IsArray()) {
auto stop_words_array = options.Get("stop").As<Napi::Array>();
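For context, a minimal sketch of the initialization flow after this change, assuming llama_init_from_gpt_params at b3889 returns a result struct with .model and .context members as used above; the model path and thread count are placeholder values.

// Minimal sketch (not the addon's code): loading a model with the b3889
// common API, where llama_init_from_gpt_params returns a struct instead of
// a pair, and n_threads / seed moved under cpuparams / sparams.
#include "common/common.h"
#include "llama.h"

int main() {
  gpt_params params;
  params.model = "model.gguf";              // placeholder path
  params.cpuparams.n_threads = 4;           // was params.n_threads
  params.sparams.seed = LLAMA_DEFAULT_SEED; // was params.seed

  llama_backend_init();
  llama_numa_init(params.numa);

  auto result = llama_init_from_gpt_params(params);
  if (result.model == nullptr || result.context == nullptr) {
    return 1; // failed to load the model or create the context
  }

  // ... run completions with result.model / result.context ...

  llama_free(result.context);
  llama_free_model(result.model);
  llama_backend_free();
  return 0;
}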
3 changes: 2 additions & 1 deletion src/common.hpp
@@ -1,6 +1,7 @@
#pragma once

#include "common/common.h"
#include "common/sampling.h"
#include "llama.h"
#include <memory>
#include <mutex>
@@ -12,7 +13,7 @@

typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
-typedef std::unique_ptr<llama_sampling_context, decltype(&llama_sampling_free)>
+typedef std::unique_ptr<gpt_sampler, decltype(&gpt_sampler_free)>
LlamaCppSampling;
typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;

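As a usage note, the sketch below shows how the updated LlamaCppSampling alias pairs gpt_sampler_init with gpt_sampler_free through std::unique_ptr; the make_sampler helper is purely illustrative, and the sampling-params type is assumed to be the gpt_sampler_params declared in common/sampling.h.

// Minimal sketch (illustrative only): the RAII alias from common.hpp ties a
// gpt_sampler's lifetime to scope, so gpt_sampler_free runs automatically.
#include "common/common.h"
#include "common/sampling.h"
#include "llama.h"
#include <memory>

typedef std::unique_ptr<gpt_sampler, decltype(&gpt_sampler_free)> LlamaCppSampling;

// Hypothetical helper: build a scoped sampler for an already-loaded model.
static LlamaCppSampling make_sampler(const llama_model *model,
                                     const gpt_sampler_params &sparams) {
  return LlamaCppSampling{gpt_sampler_init(model, sparams), gpt_sampler_free};
}
// When the returned wrapper goes out of scope, the custom deleter releases
// the sampler, matching how LlamaCompletionWorker holds `sampling` above.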
2 changes: 1 addition & 1 deletion src/llama.cpp
Submodule llama.cpp updated 337 files
