Changes from all commits (44 commits)
1eac779  fix: ggml: fix vulkan-shaders-gen build (sparkleholic, Nov 22, 2024)
602295b  refactor: ggml: Improve vulkan-shaders-gen toolchain setup (sparkleholic, Nov 28, 2024)
7c1a685  refactor: ggml: Improve vulkan-shaders-gen toolchain setup (sparkleholic, Dec 24, 2024)
30caac3  llama : the WPM vocabs use the CLS token as BOS (#10930) (ggerganov, Dec 24, 2024)
09fe2e7  server: allow filtering llama server response fields (#10940) (nvrxq, Dec 24, 2024)
2cd43f4  ggml : more perfo with llamafile tinyblas on x86_64 (#10714) (Djip007, Dec 24, 2024)
9ba399d  server : add support for "encoding_format": "base64" to the */embeddi… (elk-cloner, Dec 24, 2024)
d283d02  examples, ggml : fix GCC compiler warnings (#10983) (peter277, Dec 26, 2024)
d79d8f3  vulkan: multi-row k quants (#10846) (netrunnereve, Dec 26, 2024)
16cdce7  server : fix token duplication when streaming with stop strings (#10997) (z80maniac, Dec 28, 2024)
f865ea1  server: added more docs for response_fields field (#10995) (isaac-mcfadyen, Dec 28, 2024)
fdd2188  vulkan: Use push constant offset to handle misaligned descriptors (#1… (jeffbolznv, Dec 29, 2024)
a813bad  vulkan: im2col and matmul optimizations for stable diffusion (#10942) (jeffbolznv, Dec 29, 2024)
c250ecb  android : fix llama_batch free (#11014) (ag2s20150909, Dec 30, 2024)
716bd6d  vulkan: optimize mul_mat for small values of N (#10991) (jeffbolznv, Dec 30, 2024)
6e1531a  common, examples, ggml : fix MSYS2 GCC compiler errors and warnings w… (peter277, Dec 31, 2024)
bc7b1f8  convert : fix Llama-3_1-Nemotron-51B rope settings (#11008) (ymcki, Dec 31, 2024)
5896c65  server : add OAI compat for /v1/completions (#10974) (ngxson, Dec 31, 2024)
45095a6  server : clean up built-in template detection (#11026) (ngxson, Dec 31, 2024)
0827b2c  ggml : fixes for AVXVNNI instruction set with MSVC and Clang (#11027) (Srihari-mcw, Dec 31, 2024)
a45433b  readme : add llama-swap to infrastructure section (#11032) (mostlygeek, Jan 2, 2025)
0da5d86  server : allow using LoRA adapters per-request (#10994) (ngxson, Jan 2, 2025)
2f0ee84  server: bench: minor fixes (#10765) (phymbert, Jan 2, 2025)
f66f582  llama : refactor `src/llama.cpp` (#10902) (ggerganov, Jan 3, 2025)
e7da954  metal : avoid uint (#11019) (ggerganov, Jan 3, 2025)
4b0c638  common : disable KV cache shifting automatically for unsupported mode… (MollySophia, Jan 3, 2025)
c31fc8b  fix: Vulkan shader gen binary path (#11037) (giladgd, Jan 4, 2025)
db68c93  ggml : improve inputs log sched_print_assignments (ggml/1053) (danbev, Dec 19, 2024)
5e3b08d  ggml : do not install metal source when embed library (ggml/1054) (ggerganov, Jan 4, 2025)
78c6785  sync : ggml (ggerganov, Jan 4, 2025)
46be942  llama : add support for the cohere2 model architecture (#10900) (dranger003, Jan 4, 2025)
f922a9c  [GGML][RPC] Support for models with non-512-aligned tensors over RPC.… (matt23654, Jan 4, 2025)
9394bbd  llama : Add support for DeepSeek V3 (#11049) (fairydreaming, Jan 4, 2025)
b56f079  Vulkan: Add device-specific blacklist for coopmat for the AMD proprie… (0cc4m, Jan 4, 2025)
46e3556  CUDA: add BF16 support (#11093) (JohannesGaessler, Jan 6, 2025)
5047dd3  llama : use _impl suffix instead of _internal (#11060) (ggerganov, Jan 6, 2025)
727368c  llama : use LLAMA_TOKEN_NULL (#11062) (ggerganov, Jan 6, 2025)
ae2f606  mmap : fix fileno macro clash (#11076) (ggerganov, Jan 6, 2025)
3e6e7a6  tokenize : escape the prompt (#11058) (ggerganov, Jan 6, 2025)
47182dd  llama : update llama_model API names (#11063) (ggerganov, Jan 6, 2025)
6369f86  llama : rename missed batch params/vars to ubatch (#10059) (danbev, Jan 6, 2025)
96a1dc2  llama : prevent system info string accumulation across calls (#11101) (a-ghorbani, Jan 6, 2025)
09186fa  llama : remove check flash_attn with lora (#11104) (ngxson, Jan 6, 2025)
cb6d4b3  Merge branch 'master' into master_fix (mtmcp, Jan 6, 2025)
28 changes: 13 additions & 15 deletions .github/workflows/build.yml
@@ -60,8 +60,7 @@ jobs:
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
-DBUILD_SHARED_LIBS=OFF
-DGGML_RPC=ON
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
@@ -123,8 +122,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DBUILD_SHARED_LIBS=OFF
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
@@ -181,7 +179,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)

- name: Test
@@ -651,23 +649,23 @@ jobs:
matrix:
include:
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
- build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
- build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
- build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -914,7 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
1 change: 1 addition & 0 deletions README.md
@@ -201,6 +201,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server

</details>

4 changes: 2 additions & 2 deletions common/arg.cpp
@@ -1512,15 +1512,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & value) {
params.lora_adapters.push_back({ std::string(value), 1.0 });
params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--lora-scaled"}, "FNAME", "SCALE",
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & fname, const std::string & scale) {
params.lora_adapters.push_back({ fname, std::stof(scale) });
params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
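For context, a minimal standalone sketch (not part of this diff) of what the updated --lora / --lora-scaled handlers now construct; the struct mirrors the common.h change later in this PR, and the adapter file names are made up for illustration. The handle stays null at parse time and is filled in later by common_init_from_params.

// Sketch only: shape of the aggregate the CLI handlers push_back.
#include <string>
#include <vector>

struct llama_lora_adapter;  // opaque handle from llama.h

struct common_lora_adapter_info {
    std::string path;
    float scale;
    llama_lora_adapter * ptr;   // populated during model init, not at parse time
};

int main() {
    std::vector<common_lora_adapter_info> lora_adapters;
    lora_adapters.push_back({ "adapter.gguf", 1.0f, nullptr });  // --lora FNAME
    lora_adapters.push_back({ "style.gguf",   0.5f, nullptr });  // --lora-scaled FNAME SCALE
}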
66 changes: 40 additions & 26 deletions common/common.cpp
@@ -18,6 +18,7 @@
#include <cstdarg>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
@@ -62,7 +63,9 @@
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#define PATH_MAX MAX_PATH
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#else
#include <sys/syslimits.h>
#endif
@@ -843,7 +846,7 @@ struct common_init_result common_init_from_params(common_params & params) {
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
model = llama_model_load_from_file(params.model.c_str(), mparams);
}

if (model == NULL) {
@@ -870,7 +873,7 @@ struct common_init_result common_init_from_params(common_params & params) {
}

if (!ok) {
llama_free_model(model);
llama_model_free(model);

return iparams;
}
@@ -881,14 +884,13 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
llama_model_free(model);
return iparams;
}

if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
llama_free_model(model);
return iparams;
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}

if (!params.control_vectors.empty()) {
Expand All @@ -898,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);

return iparams;
}
@@ -911,28 +913,29 @@ struct common_init_result common_init_from_params(common_params & params) {
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);

return iparams;
}
}

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
common_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
llama_lora_adapter_ptr lora;
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
llama_model_free(model);
return iparams;
}
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters

la.ptr = lora.get();
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}

if (!params.lora_init_without_apply) {
common_lora_adapters_apply(lctx, iparams.lora_adapters);
common_lora_adapters_apply(lctx, params.lora_adapters);
}

if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -979,7 +982,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = bos;
}
tmp.clear();
Expand All @@ -993,17 +996,17 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset(lctx);
}

iparams.model = model;
iparams.context = lctx;
iparams.model.reset(model);
iparams.context.reset(lctx);

return iparams;
}

void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale);
llama_lora_adapter_set(ctx, la.ptr, la.scale);
}
}
}
@@ -1148,8 +1151,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
#endif

// Check if the file already exists locally
struct stat model_file_info;
auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
auto file_exists = std::filesystem::exists(path);

// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
@@ -1409,7 +1411,7 @@ struct llama_model * common_load_model_from_url(
}
}

return llama_load_model_from_file(local_path.c_str(), params);
return llama_model_load_from_file(local_path.c_str(), params);
}

struct llama_model * common_load_model_from_hf(
@@ -1612,6 +1614,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
// Chat template utils
//

std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
}

bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
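A rough usage sketch of the new common_get_builtin_chat_template helper added above; not part of this diff. It assumes common.h from this PR is available, `model` was loaded via llama_model_load_from_file, and the "chatml" fallback name is illustrative only.

// Sketch: prefer an explicit --chat-template, then the model's embedded template.
#include <string>
#include "common.h"   // declares common_get_builtin_chat_template (added in this PR)

std::string resolve_chat_template(const struct llama_model * model,
                                  const std::string & cli_template) {
    if (!cli_template.empty()) {
        return cli_template;                               // user override wins
    }
    const std::string builtin = common_get_builtin_chat_template(model);
    if (!builtin.empty()) {
        return builtin;                                    // tokenizer.chat_template from GGUF metadata
    }
    return "chatml";                                       // illustrative default, not from this PR
}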
29 changes: 18 additions & 11 deletions common/common.h
@@ -2,7 +2,7 @@

#pragma once

#include "llama.h"
#include "llama-cpp.h"

#include <string>
#include <vector>
@@ -27,10 +27,8 @@
struct common_lora_adapter_info {
std::string path;
float scale;
};

struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
struct llama_lora_adapter * ptr;
};

using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//

// note: defines object's lifetime
struct common_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
std::vector<common_lora_adapter_container> lora_adapters;
llama_model_ptr model;
llama_context_ptr context;

std::vector<llama_lora_adapter_ptr> lora;
};

struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

//
// Batch utils
@@ -571,6 +571,9 @@ struct common_chat_msg {
std::string content;
};

// Get the built-in chat template for the model. Return empty string if not present.
std::string common_get_builtin_chat_template(const struct llama_model * model);

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl);

@@ -637,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils
//

static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
namespace {

const char * const LLM_KV_SPLIT_NO = "split.no";
const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}
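Since common_init_result now owns the model and context through smart-pointer handles, here is a hedged sketch of how a caller consumes it after this change; it assumes llama-cpp.h provides llama_model_ptr and llama_context_ptr as unique_ptr-style aliases, as the include swap above suggests, and omits parameter setup.

// Sketch: callers no longer call llama_free / llama_free_model themselves;
// the handles in common_init_result release everything on scope exit.
#include "common.h"   // common_params, common_init_result, common_init_from_params (this PR)

int run(common_params & params) {
    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model.get();
    llama_context * ctx   = init.context.get();
    if (model == nullptr || ctx == nullptr) {
        return 1;     // load failed; nothing to free manually
    }

    // ... tokenize / decode with `ctx` as before ...

    return 0;         // model, context and LoRA adapters are freed automatically here
}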