From 93864cda8a024227dec297085f6152662738ea0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Wed, 22 Jan 2025 15:19:34 +0100 Subject: [PATCH 001/100] llama : experimental DeepSeek2 MLA implementation that caches latent kv representations --- src/llama-kv-cache.cpp | 16 ++++++- src/llama-kv-cache.h | 7 +++ src/llama.cpp | 99 +++++++++++++++++++++++++++++++++++------- 3 files changed, 106 insertions(+), 16 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 90b6c56ed068c..99fd1d8df1c32 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -53,7 +53,7 @@ bool llama_kv_cache_init( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { struct ggml_init_params params = { - /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(4u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -71,6 +71,10 @@ bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); + // DeepSeek MLA + cache.kr_l.reserve(n_layer); + cache.kv_l.reserve(n_layer); + for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); @@ -97,6 +101,16 @@ bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); + + // DeepSeek MLA + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_format_name(kr, "cache_kr_l%d", i); + ggml_format_name(kv, "cache_kv_l%d", i); + cache.kr_l.push_back(kr); + cache.kv_l.push_back(kv); } // allocate tensors and initialize the buffers to avoid NaNs in the padding diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998c645..7f2e1b3e7b144 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -49,11 +49,18 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; + ggml_type type_kr = GGML_TYPE_F32; + ggml_type type_kv = GGML_TYPE_F32; + std::vector cells; std::vector k_l; // per layer std::vector v_l; + // DeepSeek MLA + std::vector kr_l; // per layer + std::vector kv_l; + std::vector ctxs; std::vector bufs; diff --git a/src/llama.cpp b/src/llama.cpp index 60728e5bb91ca..99af190e1474b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8860,32 +8860,37 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(kv_compressed, "kv_compressed", il); + struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); + cb(kv_cache_view, "kv_cache_view", il); + + // note: storing c^KV in the KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); + + struct ggml_tensor * kv_cache = + ggml_view_2d(ctx0, kv_self.kv_l[il], + kv_lora_rank, n_kv, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank), + 0); + cb(kv_cache, "kv_cache", il); + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cache); cb(kv, "kv", 
il); // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_kv, ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), 0); cb(k_nope, "k_nope", il); // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_kv, ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), ggml_row_size(kv->type, (n_embd_head_qk_nope))); cb(v_states, "v_states", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); - - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, @@ -8903,15 +8908,61 @@ struct llm_build_context { ); cb(k_pe, "k_pe", il); + struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); + cb(kr_cache_view, "kr_cache_view", il); + + // note: storing RoPE-ed version of K^R in the KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view)); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); cb(q_states, "q_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + struct ggml_tensor * kr_cache = + ggml_view_2d(ctx0, kv_self.kr_l[il], + n_embd_head_qk_rope, n_kv, + ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope), + 0); + cb(kr_cache, "kr_cache", il); + + // TODO is there a better way? 
+ struct ggml_tensor * kr_rep_shape = ggml_new_tensor_3d(ctx0, kr_cache->type, kr_cache->ne[0], kr_cache->ne[1], n_head); + struct ggml_tensor * kr_rep = ggml_repeat(ctx0, kr_cache, kr_rep_shape); + kr_rep = ggml_permute(ctx0, kr_rep, 0, 2, 1, 3); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, kr_rep, 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + q_states = ggml_permute(ctx0, q_states, 0, 2, 1, 3); + cb(q_states, "q_states", il); + + k_states = ggml_permute(ctx0, k_states, 0, 2, 1, 3); + cb(k_states, "k_states", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k_states, q_states); + cb(kq, "kq", il); + + kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + v_states = ggml_permute(ctx0, v_states, 1, 2, 0, 3); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v_states, kq); + cb(kqv, "kqv", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "kqv_out", il); } if (il == n_layer - 1) { @@ -12004,6 +12055,24 @@ struct llama_context * llama_new_context_with_model( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } + { + size_t memory_size_kr = 0; + size_t memory_size_kv = 0; + + for (auto & kr : ctx->kv_self.kr_l) { + memory_size_kr += ggml_nbytes(kr); + } + + for (auto & kv : ctx->kv_self.kv_l) { + memory_size_kv += ggml_nbytes(kv); + } + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB\n", __func__, + (float)(memory_size_kr + memory_size_kv) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_kv / (1024.0f * 1024.0f)); + } + // graph outputs buffer { // resized during inference when a batch uses more outputs From f07c2ec505f2ba93c3ec8246b258a9c97c7c1660 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 24 Jan 2025 20:56:09 +0100 Subject: [PATCH 002/100] llama : add option to override tensor buffers --- common/arg.cpp | 38 ++++++++++++++++++++++++++++++++++++++ common/common.cpp | 10 ++++++++++ common/common.h | 1 + include/llama.h | 8 ++++++++ src/llama-model-loader.cpp | 5 ++++- src/llama-model-loader.h | 8 +++++--- src/llama-model.cpp | 21 +++++++++++++++++++-- src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 9 files changed, 87 insertions(+), 8 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a6226a34b1860..d746f832e541d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,5 +1,6 @@ #include "arg.h" +#include "common.h" #include "log.h" #include "sampling.h" @@ -321,6 +322,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } + if (!params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + if (params.reranking && params.embedding) { throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); } @@ -1477,6 +1482,39 @@ common_params_context 
common_params_parser_init(common_params & params, llama_ex exit(0); } )); + add_opt(common_arg( + {"--override-tensor", "-ot"}, "=,...", + "override tensor buffer type", [](common_params & params, const std::string & value) { + static std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // FIXME: this leaks memory + params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)}); + } + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", diff --git a/common/common.cpp b/common/common.cpp index 6dea8e3d25238..1af628625ffe1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1083,15 +1083,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (!params.devices.empty()) { mparams.devices = params.devices.data(); } + if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; + if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1099,6 +1102,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.kv_overrides = params.kv_overrides.data(); } + if (params.tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = NULL; + } else { + GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = params.tensor_buft_overrides.data(); + } + return mparams; } diff --git a/common/common.h b/common/common.h index 571260372090f..9b42a8944d618 100644 --- a/common/common.h +++ b/common/common.h @@ -256,6 +256,7 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; + std::vector tensor_buft_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) std::vector lora_adapters; // lora adapter path with user defined scale diff --git a/include/llama.h b/include/llama.h index 3b75e760780ef..26c6dd12828c5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -275,10 +275,18 @@ extern "C" { }; }; + struct llama_model_tensor_buft_override { + const char * pattern; + ggml_backend_buffer_type_t buft; + }; + struct llama_model_params { // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) ggml_backend_dev_t * devices; + // NULL-terminated list of buffer types to use for tensors that match a pattern + const struct llama_model_tensor_buft_override * tensor_buft_overrides; + int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 75073bf610ac3..c64e974a94f57 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p) { + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader( } } + tensor_buft_overrides = param_tensor_buft_overrides_p; + // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index fe35404b26889..0f52b011b6986 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -77,8 +77,9 @@ struct llama_model_loader { llama_mmaps mappings; - std::map weights_map; - std::unordered_map kv_overrides; + std::map weights_map; + std::unordered_map kv_overrides; + const llama_model_tensor_buft_override * tensor_buft_overrides; gguf_context_ptr meta; std::vector contexts; @@ -95,7 +96,8 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p); + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 031b4c30b75dd..6b1653536f39e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1444,9 +1444,25 @@ bool llama_model::load_tensors(llama_model_loader & ml) { GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); } - ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list); + ggml_backend_buffer_type_t buft = nullptr; + + // check overrides + if (ml.tensor_buft_overrides) { + std::string tensor_name = tn.str(); + for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + if (tensor_name.find(overrides->pattern) != std::string::npos) { + LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); + buft = 
overrides->buft; + break; + } + } + } + if (!buft) { - throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + buft = select_weight_buft(hparams, t_meta, op, *buft_list); + if (!buft) { + throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + } } // avoid using a host buffer when using mmap @@ -3757,6 +3773,7 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const { struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.devices =*/ nullptr, + /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a373..ab50c5d179a29 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index e8cfe5012819c..e2ca1d7b45c47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); From de538aa32929a10555097f01cad91639dfbe84ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sat, 25 Jan 2025 18:10:22 +0100 Subject: [PATCH 003/100] llama : optimize DeepSeek MLA implementation --- convert_hf_to_gguf.py | 23 ++++++++++ gguf-py/gguf/constants.py | 6 +++ gguf-py/gguf/tensor_mapping.py | 8 ++++ src/llama-arch.cpp | 6 +++ src/llama-arch.h | 2 + src/llama-kv-cache.cpp | 1 + src/llama-kv-cache.h | 4 +- src/llama-model.cpp | 2 + src/llama-model.h | 2 + src/llama.cpp | 83 ++++++++++++++++++---------------- 10 files changed, 96 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 63b54a9cf6b48..4df55e7b15b93 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4136,6 +4136,29 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2); + k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) + v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) + + return [ + (self.map_tensor_name(name), data_torch), + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] + + return 
[(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fe84df21ea20..12522928a8c28 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -356,6 +356,8 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -543,6 +545,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -1333,6 +1337,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 617791e240b60..df831ba70594c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -586,6 +586,14 @@ class TensorNameMap: "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_K_B: ( + "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_V_B: ( + "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a7260f495d945..e6daa1bc4b5ce 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -999,6 +999,8 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, @@ -1330,6 +1332,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, @@ -1347,6 +1351,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 122fdcebe0af6..c6105d59ac1f3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -277,6 +277,8 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8a836c784eca5..51e71437c1391 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -105,6 +105,7 @@ bool llama_kv_cache_init( // DeepSeek MLA const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; + LLAMA_LOG_DEBUG("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); ggml_format_name(kr, "cache_kr_l%d", i); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 7f2e1b3e7b144..a87344c849235 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -49,8 +49,8 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - ggml_type type_kr = GGML_TYPE_F32; - ggml_type type_kv = GGML_TYPE_F32; + ggml_type type_kr = GGML_TYPE_F16; + ggml_type type_kv = GGML_TYPE_F16; std::vector cells; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 031b4c30b75dd..8007e730d04f8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2870,6 +2870,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..1fdbd3721d630 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -161,6 +161,8 @@ struct llama_layer { struct ggml_tensor * wq_b = nullptr; struct ggml_tensor * wkv_a_mqa = nullptr; struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wk_b = nullptr; + struct ggml_tensor * wv_b = nullptr; struct ggml_tensor * wq_cross = nullptr; struct ggml_tensor * wk_cross = nullptr; struct ggml_tensor * wv_cross = nullptr; diff --git a/src/llama.cpp b/src/llama.cpp index 5a9518a8e93e2..cb9fe8c9714f5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6483,24 +6483,6 @@ struct llm_build_context { 0); cb(kv_cache, "kv_cache", il); - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cache); - cb(kv, "kv", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_kv, - 
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); - - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_kv, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, @@ -6524,9 +6506,6 @@ struct llm_build_context { // note: storing RoPE-ed version of K^R in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view)); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - struct ggml_tensor * kr_cache = ggml_view_2d(ctx0, kv_self.kr_l[il], n_embd_head_qk_rope, n_kv, @@ -6534,36 +6513,62 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - // TODO is there a better way? - struct ggml_tensor * kr_rep_shape = ggml_new_tensor_3d(ctx0, kr_cache->type, kr_cache->ne[0], kr_cache->ne[1], n_head); - struct ggml_tensor * kr_rep = ggml_repeat(ctx0, kr_cache, kr_rep_shape); - kr_rep = ggml_permute(ctx0, kr_rep, 0, 2, 1, 3); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, kr_rep, 0); - cb(k_states, "k_states", il); + struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0); + cb(wk_b, "wk_b", il); - q_states = ggml_permute(ctx0, q_states, 0, 2, 1, 3); - cb(q_states, "q_states", il); + struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 3, 1); + cb(q_nope_perm, "q_nope_perm", il); - k_states = ggml_permute(ctx0, k_states, 0, 2, 1, 3); - cb(k_states, "k_states", il); + struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm); + cb(q_nope2, "q_nope2", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k_states, q_states); - cb(kq, "kq", il); + struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 1, 3, 2); + cb(q_nope2_perm, "q_nope2_perm", il); + + struct ggml_tensor * kv_cache_perm = ggml_cont(ctx0, ggml_permute(ctx0, kv_cache, 1, 0, 2, 3)); + cb(kv_cache_perm, "kv_cache_perm", il); + + struct ggml_tensor * scores1 = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm); + cb(scores1, "scores1", il); + + struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1); + cb(q_pe_perm, "q_pe_perm", il); + + struct ggml_tensor * kr_cache_perm = ggml_permute(ctx0, kr_cache, 0, 2, 3, 1); + cb(kr_cache_perm, "kr_cache_perm", il); + + struct ggml_tensor * scores2 = ggml_mul_mat(ctx0, kr_cache, q_pe_perm); + cb(scores2, "scores2", il); + + struct ggml_tensor * scores = ggml_add(ctx0, scores1, scores2); + cb(scores, "scores", il); + + struct ggml_tensor * kq = ggml_permute(ctx0, scores, 0, 3, 1, 2); + + struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); + cb(wv_b, "wv_b", il); kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, 
"kq_soft_max_ext", il); - v_states = ggml_permute(ctx0, v_states, 1, 2, 0, 3); - cb(v_states, "v_states", il); + struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 3, 1); + cb(kq_perm, "kq_perm", il); - v_states = ggml_cont(ctx0, v_states); + struct ggml_tensor * kqv1 = ggml_mul_mat(ctx0, kv_cache_perm, kq_perm); + cb(kqv1, "kqv1", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v_states, kq); - cb(kqv, "kqv", il); + struct ggml_tensor * kqv1_trans = ggml_permute(ctx0, kqv1, 0, 1, 3, 2); + cb(kqv1_trans, "kqv1_trans", il); + + struct ggml_tensor * kqv2 = ggml_mul_mat(ctx0, wv_b, kqv1_trans); + cb(kqv2, "kqv2", il); + + struct ggml_tensor * kqv2_trans = ggml_permute(ctx0, kqv2, 0, 3, 2, 1); + cb(kqv2_trans, "kqv2_trans", il); GGML_ASSERT(kv_self.size == n_ctx); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv2_trans, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); From ce730637e8fe1b86de7d6e1758f33d716c6c7781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 26 Jan 2025 12:50:17 +0100 Subject: [PATCH 004/100] llama : Update tensor names in DeepSeek2 MLA implementation. --- src/llama.cpp | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index cb9fe8c9714f5..08b27b33add97 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6525,11 +6525,8 @@ struct llm_build_context { struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 1, 3, 2); cb(q_nope2_perm, "q_nope2_perm", il); - struct ggml_tensor * kv_cache_perm = ggml_cont(ctx0, ggml_permute(ctx0, kv_cache, 1, 0, 2, 3)); - cb(kv_cache_perm, "kv_cache_perm", il); - - struct ggml_tensor * scores1 = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm); - cb(scores1, "scores1", il); + struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm); + cb(kq_nope, "kq_nope", il); struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1); cb(q_pe_perm, "q_pe_perm", il); @@ -6537,13 +6534,14 @@ struct llm_build_context { struct ggml_tensor * kr_cache_perm = ggml_permute(ctx0, kr_cache, 0, 2, 3, 1); cb(kr_cache_perm, "kr_cache_perm", il); - struct ggml_tensor * scores2 = ggml_mul_mat(ctx0, kr_cache, q_pe_perm); - cb(scores2, "scores2", il); + struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe_perm); + cb(kq_pe, "kq_pe", il); - struct ggml_tensor * scores = ggml_add(ctx0, scores1, scores2); - cb(scores, "scores", il); + struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); + cb(kq, "kq", il); - struct ggml_tensor * kq = ggml_permute(ctx0, scores, 0, 3, 1, 2); + kq = ggml_permute(ctx0, kq, 0, 3, 1, 2); + cb(kq, "kq_perm", il); struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); cb(wv_b, "wv_b", il); @@ -6552,27 +6550,25 @@ struct llm_build_context { cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 3, 1); - cb(kq_perm, "kq_perm", il); - - struct ggml_tensor * kqv1 = ggml_mul_mat(ctx0, kv_cache_perm, kq_perm); - cb(kqv1, "kqv1", il); + cb(kq_perm, "kq_soft_max_ext_perm", il); - struct ggml_tensor * kqv1_trans = ggml_permute(ctx0, kqv1, 0, 1, 3, 2); - cb(kqv1_trans, "kqv1_trans", il); + struct ggml_tensor * 
kv_cache_trans = ggml_cont(ctx0, ggml_transpose(ctx0, kv_cache)); + cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kqv2 = ggml_mul_mat(ctx0, wv_b, kqv1_trans); - cb(kqv2, "kqv2", il); + struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm); + cb(kqv_compressed, "kqv_compressed", il); - struct ggml_tensor * kqv2_trans = ggml_permute(ctx0, kqv2, 0, 3, 2, 1); - cb(kqv2_trans, "kqv2_trans", il); + kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 1, 3, 2); + cb(kqv_compressed, "kqv_compressed_perm", il); - GGML_ASSERT(kv_self.size == n_ctx); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); + cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv2_trans, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + kqv = ggml_permute(ctx0, kqv, 0, 3, 1, 2); + cb(kqv, "kqv_perm", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); + cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0); + cb(cur, "kqv_2d", il); ggml_build_forward_expand(gf, cur); From 202f323e66809bb1df192245caddc49471660466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 26 Jan 2025 18:29:54 +0100 Subject: [PATCH 005/100] llama : add a second copy of c^KV cache in DeepSeek2 MLA to avoid transposing the cache during inference --- src/llama-kv-cache.cpp | 6 +++++- src/llama-kv-cache.h | 1 + src/llama.cpp | 16 +++++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 51e71437c1391..57ccbeeae7e26 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -53,7 +53,7 @@ bool llama_kv_cache_init( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { struct ggml_init_params params = { - /*.mem_size =*/ size_t(4u*n_layer*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(5u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -74,6 +74,7 @@ bool llama_kv_cache_init( // DeepSeek MLA cache.kr_l.reserve(n_layer); cache.kv_l.reserve(n_layer); + cache.kvt_l.reserve(n_layer); for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); @@ -108,10 +109,13 @@ bool llama_kv_cache_init( LLAMA_LOG_DEBUG("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); ggml_format_name(kr, "cache_kr_l%d", i); ggml_format_name(kv, "cache_kv_l%d", i); + ggml_format_name(kvt, "cache_kvt_l%d", i); cache.kr_l.push_back(kr); cache.kv_l.push_back(kv); + cache.kvt_l.push_back(kvt); } // allocate tensors and initialize the buffers to avoid NaNs in the padding diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index a87344c849235..b10540d76442e 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -60,6 +60,7 @@ struct llama_kv_cache { // DeepSeek MLA std::vector kr_l; // per layer std::vector kv_l; + std::vector kvt_l; std::vector ctxs; std::vector bufs; diff --git a/src/llama.cpp b/src/llama.cpp index 08b27b33add97..d9fe40102b346 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6476,6 +6476,12 @@ struct llm_build_context { // note: storing c^KV 
in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); + struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); + cb(kv_cache_trans_view, "kv_cache_trans_view", il); + + // note: storing transposed c^KV in the transposed KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); + struct ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank, n_kv, @@ -6483,6 +6489,13 @@ struct llm_build_context { 0); cb(kv_cache, "kv_cache", il); + struct ggml_tensor * kv_cache_trans = + ggml_view_2d(ctx0, kv_self.kvt_l[il], + n_kv, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), + 0); + cb(kv_cache_trans, "kv_cache_trans", il); + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, @@ -6552,9 +6565,6 @@ struct llm_build_context { struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 3, 1); cb(kq_perm, "kq_soft_max_ext_perm", il); - struct ggml_tensor * kv_cache_trans = ggml_cont(ctx0, ggml_transpose(ctx0, kv_cache)); - cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm); cb(kqv_compressed, "kqv_compressed", il); From 93c5937249c313bf825d020f4a5213e32c94737c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 26 Jan 2025 22:23:13 +0100 Subject: [PATCH 006/100] llama : modified tensor permutations to multiply larger matrices during inference --- src/llama.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d9fe40102b346..3df9896922254 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6529,13 +6529,13 @@ struct llm_build_context { struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0); cb(wk_b, "wk_b", il); - struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 3, 1); + struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope_perm, "q_nope_perm", il); struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm); cb(q_nope2, "q_nope2", il); - struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 1, 3, 2); + struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); cb(q_nope2_perm, "q_nope2_perm", il); struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm); @@ -6547,34 +6547,34 @@ struct llm_build_context { struct ggml_tensor * kr_cache_perm = ggml_permute(ctx0, kr_cache, 0, 2, 3, 1); cb(kr_cache_perm, "kr_cache_perm", il); - struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe_perm); + struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); cb(kq_pe, "kq_pe", il); struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); cb(kq, "kq", il); - kq = ggml_permute(ctx0, kq, 0, 3, 1, 2); + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); cb(kq, "kq_perm", il); - struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), 
ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); - cb(wv_b, "wv_b", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 3, 1); + struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 1, 3); cb(kq_perm, "kq_soft_max_ext_perm", il); struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm); cb(kqv_compressed, "kqv_compressed", il); - kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 1, 3, 2); + kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3); cb(kqv_compressed, "kqv_compressed_perm", il); + struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); + cb(wv_b, "wv_b", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); cb(kqv, "kqv", il); - kqv = ggml_permute(ctx0, kqv, 0, 3, 1, 2); + kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3)); cb(kqv, "kqv_perm", il); cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0); From 1eee98f01fca721a889defac3d38e9ada7abb617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Mon, 27 Jan 2025 09:32:25 +0100 Subject: [PATCH 007/100] llama : removed unnecessary code in DeepSeek V2 implementation --- src/llama.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3df9896922254..a4c78240b265e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6544,9 +6544,6 @@ struct llm_build_context { struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1); cb(q_pe_perm, "q_pe_perm", il); - struct ggml_tensor * kr_cache_perm = ggml_permute(ctx0, kr_cache, 0, 2, 3, 1); - cb(kr_cache_perm, "kr_cache_perm", il); - struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); cb(kq_pe, "kq_pe", il); From 8ff0991eed65e4041e6e3dfa2e3c98aee7fa2c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Tue, 28 Jan 2025 11:02:52 +0100 Subject: [PATCH 008/100] convert : make lint happy --- convert_hf_to_gguf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4df55e7b15b93..2be7de5a59bbe 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4148,7 +4148,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2); + k_b = k_b.transpose(1, 2) k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) @@ -4158,7 +4158,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter (self.map_tensor_name(name_vb), v_b) ] - return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): From 8a887decd35083c1542534cfadc3a5ee592da964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Tue, 28 Jan 2025 19:26:54 +0100 Subject: [PATCH 009/100] llama : prompt processing optimizations in DeepSeek V2 --- src/llama.cpp | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) 
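
Note on the change that follows: the patch below adds a pp_opt flag, set to n_tokens > n_head, and uses it to decide whether n_tokens or n_head becomes the matrix dimension of the MLA mat-muls — n_tokens dominates during prompt processing, while it is small (often 1) during token generation. As a rough standalone sketch of that heuristic only (not part of the patch; the helper name, head count and batch sizes are made-up example values):

    #include <cstdio>

    // sketch: pick the "batch" dimension for the MLA mat-muls
    static bool use_token_major(int n_tokens, int n_head) {
        return n_tokens > n_head; // mirrors pp_opt in the diff below
    }

    int main() {
        const int n_head  = 128;                  // example head count
        const int cases[] = { 1, 16, 512, 4096 }; // generation vs. prompt-processing batch sizes
        for (int n_tokens : cases) {
            std::printf("n_tokens=%4d n_head=%d -> %s-major mat-muls\n",
                        n_tokens, n_head, use_token_major(n_tokens, n_head) ? "token" : "head");
        }
        return 0;
    }
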
diff --git a/src/llama.cpp b/src/llama.cpp index a4c78240b265e..5768f9215fea9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6403,6 +6403,10 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + // whether to use n_tokens as the matrix dimension during multiplication or n_head + // n_tokens is higher during prompt processing, this allows to optimize for this case + bool pp_opt = n_tokens > n_head; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6535,14 +6539,18 @@ struct llm_build_context { struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm); cb(q_nope2, "q_nope2", il); - struct ggml_tensor * q_nope2_perm = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); - cb(q_nope2_perm, "q_nope2_perm", il); + if (!pp_opt) { + q_nope2 = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); + cb(q_nope2, "q_nope2_perm", il); + } - struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm); + struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); - struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1); - cb(q_pe_perm, "q_pe_perm", il); + if (pp_opt) { + q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); + cb(q_pe, "q_pe_perm", il); + } struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); cb(kq_pe, "kq_pe", il); @@ -6550,20 +6558,26 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); cb(kq, "kq", il); - kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); - cb(kq, "kq_perm", il); + if (!pp_opt) { + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); + cb(kq, "kq_perm", il); + } kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kq_perm = ggml_permute(ctx0, kq, 0, 2, 1, 3); - cb(kq_perm, "kq_soft_max_ext_perm", il); + if (!pp_opt) { + kq = ggml_permute(ctx0, kq, 0, 2, 1, 3); + cb(kq, "kq_soft_max_ext_perm", il); + } - struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq_perm); + struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq); cb(kqv_compressed, "kqv_compressed", il); - kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3); - cb(kqv_compressed, "kqv_compressed_perm", il); + if (!pp_opt) { + kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3); + cb(kqv_compressed, "kqv_compressed_perm", il); + } struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); cb(wv_b, "wv_b", il); From 76543311acc85e1d77575728000f1979faa7591f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Thu, 30 Jan 2025 18:25:36 +0100 Subject: [PATCH 010/100] llama : avoid ggml_cont() is possible in DeepSeek V2 implementation --- src/llama.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5768f9215fea9..1a3d1d0bda9d2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6533,10 +6533,10 @@ struct llm_build_context { struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * 
n_embd_head_qk_nope), 0); cb(wk_b, "wk_b", il); - struct ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); - cb(q_nope_perm, "q_nope_perm", il); + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); - struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope_perm); + struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); if (!pp_opt) { @@ -6547,6 +6547,11 @@ struct llm_build_context { struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); + if (!pp_opt) { + kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); + cb(kq_nope, "kq_nope_perm", il); + } + if (pp_opt) { q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); cb(q_pe, "q_pe_perm", il); @@ -6555,14 +6560,14 @@ struct llm_build_context { struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); cb(kq_pe, "kq_pe", il); - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); - cb(kq, "kq", il); - if (!pp_opt) { - kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); - cb(kq, "kq_perm", il); + kq_pe = ggml_permute(ctx0, kq_pe, 0, 2, 1, 3); + cb(kq_pe, "kq_pe_perm", il); } + struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); + cb(kq, "kq", il); + kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); @@ -6575,7 +6580,7 @@ struct llm_build_context { cb(kqv_compressed, "kqv_compressed", il); if (!pp_opt) { - kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3); + kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 3, 1); cb(kqv_compressed, "kqv_compressed_perm", il); } @@ -6585,8 +6590,10 @@ struct llm_build_context { struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); cb(kqv, "kqv", il); - kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3)); - cb(kqv, "kqv_perm", il); + if (pp_opt) { + kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3)); + cb(kqv, "kqv_perm", il); + } cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0); cb(cur, "kqv_2d", il); From 83a473a00133fe9ba66fec54cea3cae8df275ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sat, 1 Feb 2025 10:32:06 +0100 Subject: [PATCH 011/100] llama : use all experts during warmup --- src/llama.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 192b20a27e5ca..a8258becdfeb4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1092,7 +1092,8 @@ struct llm_build_context { llama_context & lctx, const llama_ubatch & ubatch, const llm_build_cb & cb, - bool worst_case) : + bool worst_case, + bool warmup) : model (lctx.model), lctx (lctx), hparams (model.hparams), @@ -1110,7 +1111,7 @@ struct llm_build_context { n_embd_head_v (hparams.n_embd_head_v), n_embd_v_gqa (hparams.n_embd_v_gqa()), n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), + n_expert_used (warmup ? 
hparams.n_expert : hparams.n_expert_used), freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), @@ -8103,7 +8104,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - struct llm_build_context llm(lctx, dummy, cb, false); + struct llm_build_context llm(lctx, dummy, cb, false, false); llm.init(); @@ -8120,7 +8121,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - struct llm_build_context llm(lctx, dummy, cb, false); + struct llm_build_context llm(lctx, dummy, cb, false, false); llm.init(); @@ -8171,7 +8172,11 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; - struct llm_build_context llm(lctx, ubatch, cb, worst_case); + const llama_vocab * vocab = llama_model_get_vocab(&model); + llama_token bos = llama_vocab_bos(vocab); + llama_token eos = llama_vocab_eos(vocab); + bool is_warming_up = (ubatch.n_tokens == 2 && ubatch.token[0] == bos && ubatch.token[1] == eos); + struct llm_build_context llm(lctx, ubatch, cb, worst_case, is_warming_up); llm.init(); From c8bc6e4ff4b9f1cb1e94eb56ddd10a95bd0108da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sat, 1 Feb 2025 12:43:14 +0100 Subject: [PATCH 012/100] llama : increased max_nodes as large MoE models use massive amounts of nodes during warmup --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 18bd0b071bb90..c958edb873a03 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3552,7 +3552,7 @@ size_t llama_model::size() const { } size_t llama_model::max_nodes() const { - return std::max(8192, tensors_by_name.size()*5); + return std::max(65536, tensors_by_name.size()*5); } size_t llama_model::n_devices() const { From 6c8d01a8bbe0d64491608089027c26ac85cce262 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 2 Feb 2025 17:23:32 +0100 Subject: [PATCH 013/100] add regex support --- src/llama-model.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f1cba4f39a676..f134d1bf1ef2a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -1464,7 +1465,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (ml.tensor_buft_overrides) { std::string tensor_name = tn.str(); for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { - if (tensor_name.find(overrides->pattern) != std::string::npos) { + std::regex pattern(overrides->pattern); + if (std::regex_search(tensor_name, pattern)) { LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); buft = overrides->buft; break; From 538f60934abd36f19598d74518cdef0ccd18a023 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Feb 2025 01:32:04 +0100 Subject: [PATCH 014/100] ggml : fix possible underflow in ggml_nbytes --- ggml/src/ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3b48615421187..52c553e76b29f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1151,6 +1151,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) 
{ + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] <= 0) { + return 0; + } + } + size_t nbytes; const size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { From 8770ffa60c0d0eac481f199f2da1bb6b622a8207 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 9 Feb 2025 00:32:52 +0100 Subject: [PATCH 015/100] rebuild buft list on every call --- common/arg.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5a98c4baf3a83..e796d0e85f946 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1485,13 +1485,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--override-tensor", "-ot"}, "=,...", "override tensor buffer type", [](common_params & params, const std::string & value) { - static std::map buft_list; + /* static */ std::map buft_list; if (buft_list.empty()) { // enumerate all the devices and add their buffer types to the list for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { auto * dev = ggml_backend_dev_get(i); auto * buft = ggml_backend_dev_buffer_type(dev); - buft_list[ggml_backend_buft_name(buft)] = buft; + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } } } From 0d4ff95b8270b481c4131795925a0b7abdc657bd Mon Sep 17 00:00:00 2001 From: Orca Date: Tue, 25 Feb 2025 20:41:08 +0800 Subject: [PATCH 016/100] can shift --- examples/server/server.cpp | 29 ++++++++++++++++++----------- src/llama-kv-cache.cpp | 2 +- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2306dc26fe431..c4db6642e9ef6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1671,9 +1671,8 @@ struct server_response { } void add_waiting_tasks(const std::vector & tasks) { - std::unique_lock lock(mutex_results); - for (const auto & task : tasks) { + std::unique_lock lock(mutex_results); SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); waiting_task_ids.insert(task.id); } @@ -1683,20 +1682,24 @@ struct server_response { void remove_waiting_task_id(int id_task) { SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - std::unique_lock lock(mutex_results); - waiting_task_ids.erase(id_task); + { + std::unique_lock lock(mutex_results); + waiting_task_ids.erase(id_task); + } // make sure to clean up all pending results - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); + { + std::unique_lock lock(mutex_results); + queue_results.erase( + std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { + return res->id == id_task; + }), + queue_results.end()); + } } void remove_waiting_task_ids(const std::unordered_set & id_tasks) { - std::unique_lock lock(mutex_results); - for (const auto & id_task : id_tasks) { + std::unique_lock lock(mutex_results); SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); waiting_task_ids.erase(id_task); } @@ -3841,6 +3844,10 @@ int main(int argc, char ** argv) { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); + if (prompt.contains("chat_history")) { + return; + } + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); tasks.reserve(tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de52cf..b5fbb3a25f0b6 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + cache.can_shift = !cache.recurrent; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); From d256aa04cecdebe286bee5036c60cff852144e34 Mon Sep 17 00:00:00 2001 From: Orca Date: Tue, 25 Feb 2025 20:43:49 +0800 Subject: [PATCH 017/100] tmp --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- src/llama-kv-cache.cpp | 4 ++-- src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ebb2ccae04065..5725fc375cc89 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3248,7 +3248,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) { } static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; + const int min_batch_size = 9999999; return get_op_batch_size(op) >= min_batch_size; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 7ded592c289e1..4d43c692ea4bd 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -96,8 +96,8 @@ bool llama_kv_cache_init( return false; } - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, 1); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, 1); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ab50c5d179a29..cabb5f8f8cdd5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,7 +776,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + new_type = name.find("_exps") != std::string::npos ? name.find("ffn_down") != std::string::npos ? 
GGML_TYPE_Q6_K : GGML_TYPE_Q5_K : GGML_TYPE_BF16; } if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; diff --git a/src/llama.cpp b/src/llama.cpp index 6cefcc7912eb5..c1aa5380498c6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6407,7 +6407,7 @@ struct llm_build_context { // whether to use n_tokens as the matrix dimension during multiplication or n_head // n_tokens is higher during prompt processing, this allows to optimize for this case - bool pp_opt = n_tokens > n_head; + bool pp_opt = true; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; From d13d6ffdfd794257895ff771a30881b0dc60854e Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 09:58:40 +0800 Subject: [PATCH 018/100] support dynamic wkv --- src/llama-model.cpp | 89 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 83fa74a28ac2d..77f7a88cb1f16 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1422,7 +1422,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { try { info = llm_tensor_info_for(tn_tensor); } catch (const std::out_of_range & e) { - throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str())); + LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str()); + return nullptr; } // skip unused tensors @@ -2911,9 +2912,93 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); + if (!layer.wk_b || !layer.wv_b) { + auto wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); + if (!wkv_b) { + throw std::runtime_error("wkv_b must be defined without wk_b and wv_b"); + } + + // select the buffer type for this tensor + buft_list_t * buft_list = pimpl->dev_input.buft_list; + + ggml_backend_buffer_type_t buft = nullptr; + + // check overrides + if (ml.tensor_buft_overrides) { + std::string tensor_name = "blk."+ std::to_string(i) +".attn_kv_b.weight"; + for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + std::regex pattern(overrides->pattern); + if (std::regex_search(tensor_name, pattern)) { + LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); + buft = overrides->buft; + break; + } + } + } + + // avoid using a host buffer when using mmap + auto * buft_dev = ggml_backend_buft_get_device(buft); + if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + buft = ggml_backend_dev_buffer_type(cpu_dev); + } + + ggml_context * ctx = ctx_for_buft(buft); + layer.wk_b = ggml_new_tensor_2d(ctx, + layer.wkv_b->type, + n_head_kv * kv_lora_rank, + n_embd_head_qk_nope + ); + { + float *src = (float *)layer.wkv_b->data; + float *dst = (float 
*)layer.wk_b->data; + int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 + + for (int h = 0; h < n_head_kv; ++h) { + int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); + for (int row = 0; row < kv_lora_rank; ++row) { + for (int col = 0; col < n_embd_head_qk_nope; ++col) { + int src_idx = row * src_stride + k_start + col; + GGML_ASSERT(src_idx < ggml_nelements(layer.wkv_b)); + + int dst_row = h * kv_lora_rank + row; + int dst_col = col; + dst[dst_row * n_embd_head_qk_nope + dst_col] = src[src_idx]; + } + } + } + } + + layer.wv_b = ggml_new_tensor_2d( + ctx, + layer.wkv_b->type, + n_head_kv * n_embd_head_v, // 行数:合并头和特征维度 + kv_lora_rank // 列数:LoRA 秩 + ); + { + float *src = (float *)layer.wkv_b->data; + float *dst = (float *)layer.wv_b->data; + int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 + + for (int h = 0; h < n_head_kv; ++h) { + int v_start = h * (n_embd_head_qk_nope + n_embd_head_v) + n_embd_head_qk_nope; + for (int row = 0; row < kv_lora_rank; ++row) { + for (int col = 0; col < n_embd_head_v; ++col) { + // 源索引计算 + int src_idx = row * src_stride + v_start + col; + GGML_ASSERT(src_idx < ggml_nelements(layer.wkv_b)); + + // 目标索引计算 + int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 + int dst_col = row; // LoRA 秩维度 + dst[dst_row * kv_lora_rank + dst_col] = src[src_idx]; + } + } + } + } + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); From 822807b8e8813db7af16c1b497c0657a8c94ba84 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:01:10 +0800 Subject: [PATCH 019/100] ignore missing --- src/llama-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 77f7a88cb1f16..a43b338625c89 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1407,7 +1407,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (flags & TENSOR_NOT_REQUIRED) { return nullptr; } - throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); + LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str()); + return nullptr; } // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops From 9f75d93e21996096d340f9757042ea6569976b31 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:05:41 +0800 Subject: [PATCH 020/100] fix core dump --- src/llama-model.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a43b338625c89..51ebb4caf54c0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2948,12 +2948,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_context * ctx = ctx_for_buft(buft); layer.wk_b = ggml_new_tensor_2d(ctx, - layer.wkv_b->type, + wkv_b->type, n_head_kv * kv_lora_rank, n_embd_head_qk_nope ); + LLAMA_LOG_DEBUG("111\n", 0); { - float *src = (float *)layer.wkv_b->data; + float *src = (float *)wkv_b->data; float *dst = (float *)layer.wk_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 @@ -2962,7 +2963,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int row = 0; row < kv_lora_rank; ++row) { for (int col = 0; col < n_embd_head_qk_nope; ++col) { int src_idx = row * src_stride + k_start + col; - GGML_ASSERT(src_idx < ggml_nelements(layer.wkv_b)); + GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); int dst_row = h * kv_lora_rank + 
row; int dst_col = col; @@ -2974,12 +2975,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wv_b = ggml_new_tensor_2d( ctx, - layer.wkv_b->type, + wkv_b->type, n_head_kv * n_embd_head_v, // 行数:合并头和特征维度 kv_lora_rank // 列数:LoRA 秩 ); { - float *src = (float *)layer.wkv_b->data; + float *src = (float *)wkv_b->data; float *dst = (float *)layer.wv_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 @@ -2989,7 +2990,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int col = 0; col < n_embd_head_v; ++col) { // 源索引计算 int src_idx = row * src_stride + v_start + col; - GGML_ASSERT(src_idx < ggml_nelements(layer.wkv_b)); + GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); // 目标索引计算 int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 From dafd46a6cd9b79363bfd2c7a2477b01505008ea7 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:08:01 +0800 Subject: [PATCH 021/100] add debug log --- src/llama-model.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 51ebb4caf54c0..7fcde49176e39 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2958,6 +2958,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { float *dst = (float *)layer.wk_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 + LLAMA_LOG_DEBUG("222\n", 0); for (int h = 0; h < n_head_kv; ++h) { int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); for (int row = 0; row < kv_lora_rank; ++row) { @@ -2968,6 +2969,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { int dst_row = h * kv_lora_rank + row; int dst_col = col; dst[dst_row * n_embd_head_qk_nope + dst_col] = src[src_idx]; + LLAMA_LOG_DEBUG("333 row: %d, col: %d\n", row, col); } } } @@ -2979,11 +2981,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { n_head_kv * n_embd_head_v, // 行数:合并头和特征维度 kv_lora_rank // 列数:LoRA 秩 ); + LLAMA_LOG_DEBUG("444\n", 0); { float *src = (float *)wkv_b->data; float *dst = (float *)layer.wv_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 + LLAMA_LOG_DEBUG("555\n", 0); for (int h = 0; h < n_head_kv; ++h) { int v_start = h * (n_embd_head_qk_nope + n_embd_head_v) + n_embd_head_qk_nope; for (int row = 0; row < kv_lora_rank; ++row) { @@ -2996,6 +3000,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 int dst_col = row; // LoRA 秩维度 dst[dst_row * kv_lora_rank + dst_col] = src[src_idx]; + LLAMA_LOG_DEBUG("666 row: %d, col: %d\n", row, col); } } } From 6277a0ee82748fc11778caa264eb1dbfb5ddddb9 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:09:33 +0800 Subject: [PATCH 022/100] debug --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fcde49176e39..6c5839d4a2fcc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2963,13 +2963,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); for (int row = 0; row < kv_lora_rank; ++row) { for (int col = 0; col < n_embd_head_qk_nope; ++col) { + LLAMA_LOG_DEBUG("333 row: %d, col: %d\n", row, col); int src_idx = row * src_stride + k_start + col; GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); int dst_row = h * kv_lora_rank + row; int dst_col = col; dst[dst_row * n_embd_head_qk_nope + dst_col] = src[src_idx]; - LLAMA_LOG_DEBUG("333 row: %d, col: %d\n", row, col); } } } @@ -2992,6 +2992,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) 
{ int v_start = h * (n_embd_head_qk_nope + n_embd_head_v) + n_embd_head_qk_nope; for (int row = 0; row < kv_lora_rank; ++row) { for (int col = 0; col < n_embd_head_v; ++col) { + LLAMA_LOG_DEBUG("666 row: %d, col: %d\n", row, col); // 源索引计算 int src_idx = row * src_stride + v_start + col; GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); @@ -3000,7 +3001,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 int dst_col = row; // LoRA 秩维度 dst[dst_row * kv_lora_rank + dst_col] = src[src_idx]; - LLAMA_LOG_DEBUG("666 row: %d, col: %d\n", row, col); } } } From 88536f706055af2b16c225c5a051c4aff1ac6c59 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:14:24 +0800 Subject: [PATCH 023/100] debug log --- src/llama-model.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6c5839d4a2fcc..e01bab23552d9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2946,29 +2946,30 @@ bool llama_model::load_tensors(llama_model_loader & ml) { buft = ggml_backend_dev_buffer_type(cpu_dev); } + LLAMA_LOG_INFO("n_head_kv: %d, kv_lora_rank: %d, n_embd_head_qk_nope: %d\n", n_head_kv, kv_lora_rank, n_embd_head_qk_nope); ggml_context * ctx = ctx_for_buft(buft); layer.wk_b = ggml_new_tensor_2d(ctx, wkv_b->type, n_head_kv * kv_lora_rank, n_embd_head_qk_nope ); - LLAMA_LOG_DEBUG("111\n", 0); + LLAMA_LOG_INFO("wk_b shape: [%d, %d]\n", layer.wk_b->ne[0], layer.wk_b->ne[1]); { float *src = (float *)wkv_b->data; float *dst = (float *)layer.wk_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 - LLAMA_LOG_DEBUG("222\n", 0); for (int h = 0; h < n_head_kv; ++h) { int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); for (int row = 0; row < kv_lora_rank; ++row) { for (int col = 0; col < n_embd_head_qk_nope; ++col) { - LLAMA_LOG_DEBUG("333 row: %d, col: %d\n", row, col); + LLAMA_LOG_INFO("wk_b row: %d, col: %d\n", row, col); int src_idx = row * src_stride + k_start + col; GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); int dst_row = h * kv_lora_rank + row; int dst_col = col; + LLAMA_LOG_INFO("wk_b dst_row: %d, dst_col: %d\n", dst_row, dst_col); dst[dst_row * n_embd_head_qk_nope + dst_col] = src[src_idx]; } } @@ -2981,18 +2982,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) { n_head_kv * n_embd_head_v, // 行数:合并头和特征维度 kv_lora_rank // 列数:LoRA 秩 ); - LLAMA_LOG_DEBUG("444\n", 0); + LLAMA_LOG_INFO("wv_b shape: [%d, %d]\n", layer.wv_b->ne[0], layer.wv_b->ne[1]); { float *src = (float *)wkv_b->data; float *dst = (float *)layer.wv_b->data; int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 - LLAMA_LOG_DEBUG("555\n", 0); for (int h = 0; h < n_head_kv; ++h) { int v_start = h * (n_embd_head_qk_nope + n_embd_head_v) + n_embd_head_qk_nope; for (int row = 0; row < kv_lora_rank; ++row) { for (int col = 0; col < n_embd_head_v; ++col) { - LLAMA_LOG_DEBUG("666 row: %d, col: %d\n", row, col); + LLAMA_LOG_INFO("wv_b row: %d, col: %d\n", row, col); // 源索引计算 int src_idx = row * src_stride + v_start + col; GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); @@ -3000,6 +3000,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // 目标索引计算 int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 int dst_col = row; // LoRA 秩维度 + LLAMA_LOG_INFO("wv_b dst_row: %d, dst_col: %d\n", dst_row, dst_col); dst[dst_row * kv_lora_rank + dst_col] = src[src_idx]; } } From fe68015f9907d129f6a0a7773ec56754d6f0f851 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:16:39 +0800 Subject: [PATCH 
024/100] add log --- src/llama-model.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e01bab23552d9..74454b677717f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2946,6 +2946,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { buft = ggml_backend_dev_buffer_type(cpu_dev); } + LLAMA_LOG_INFO("wkv_b shape: [%d, %d]\n", wkv_b->ne[0], wkv_b->ne[1]); LLAMA_LOG_INFO("n_head_kv: %d, kv_lora_rank: %d, n_embd_head_qk_nope: %d\n", n_head_kv, kv_lora_rank, n_embd_head_qk_nope); ggml_context * ctx = ctx_for_buft(buft); layer.wk_b = ggml_new_tensor_2d(ctx, @@ -2965,6 +2966,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int col = 0; col < n_embd_head_qk_nope; ++col) { LLAMA_LOG_INFO("wk_b row: %d, col: %d\n", row, col); int src_idx = row * src_stride + k_start + col; + LLAMA_LOG_INFO("src_idx: %d\n", src_idx); GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); int dst_row = h * kv_lora_rank + row; @@ -2995,6 +2997,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { LLAMA_LOG_INFO("wv_b row: %d, col: %d\n", row, col); // 源索引计算 int src_idx = row * src_stride + v_start + col; + LLAMA_LOG_INFO("src_idx: %d\n", src_idx); GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); // 目标索引计算 From 93674de625d519f2dd894ae8c19460df7744e450 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:20:12 +0800 Subject: [PATCH 025/100] debug --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 74454b677717f..06bc42e3e6bf4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2946,7 +2946,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { buft = ggml_backend_dev_buffer_type(cpu_dev); } - LLAMA_LOG_INFO("wkv_b shape: [%d, %d]\n", wkv_b->ne[0], wkv_b->ne[1]); + LLAMA_LOG_INFO("wkv_b shape: [%d, %d], type: %d\n", wkv_b->ne[0], wkv_b->ne[1], int(wkv_b->type)); LLAMA_LOG_INFO("n_head_kv: %d, kv_lora_rank: %d, n_embd_head_qk_nope: %d\n", n_head_kv, kv_lora_rank, n_embd_head_qk_nope); ggml_context * ctx = ctx_for_buft(buft); layer.wk_b = ggml_new_tensor_2d(ctx, From 770184d7dba4e1204f447e15ae01018d7ecc7fb6 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 10:57:26 +0800 Subject: [PATCH 026/100] tmp --- src/llama-model.cpp | 68 ++++----------------------------------------- 1 file changed, 6 insertions(+), 62 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 06bc42e3e6bf4..51c56d144f645 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2913,11 +2913,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); if (!layer.wk_b || !layer.wv_b) { - auto wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); - if (!wkv_b) { + if (!layer.wkv_b) { throw std::runtime_error("wkv_b must be defined without wk_b and wv_b"); } @@ -2946,69 +2946,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { buft = 
ggml_backend_dev_buffer_type(cpu_dev); } - LLAMA_LOG_INFO("wkv_b shape: [%d, %d], type: %d\n", wkv_b->ne[0], wkv_b->ne[1], int(wkv_b->type)); + LLAMA_LOG_INFO("wkv_b shape: [%d, %d], type: %d\n", layer.wkv_b->ne[0], layer.wkv_b->ne[1], int(layer.wkv_b->type)); LLAMA_LOG_INFO("n_head_kv: %d, kv_lora_rank: %d, n_embd_head_qk_nope: %d\n", n_head_kv, kv_lora_rank, n_embd_head_qk_nope); ggml_context * ctx = ctx_for_buft(buft); - layer.wk_b = ggml_new_tensor_2d(ctx, - wkv_b->type, - n_head_kv * kv_lora_rank, - n_embd_head_qk_nope - ); - LLAMA_LOG_INFO("wk_b shape: [%d, %d]\n", layer.wk_b->ne[0], layer.wk_b->ne[1]); - { - float *src = (float *)wkv_b->data; - float *dst = (float *)layer.wk_b->data; - int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 - - for (int h = 0; h < n_head_kv; ++h) { - int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); - for (int row = 0; row < kv_lora_rank; ++row) { - for (int col = 0; col < n_embd_head_qk_nope; ++col) { - LLAMA_LOG_INFO("wk_b row: %d, col: %d\n", row, col); - int src_idx = row * src_stride + k_start + col; - LLAMA_LOG_INFO("src_idx: %d\n", src_idx); - GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); - - int dst_row = h * kv_lora_rank + row; - int dst_col = col; - LLAMA_LOG_INFO("wk_b dst_row: %d, dst_col: %d\n", dst_row, dst_col); - dst[dst_row * n_embd_head_qk_nope + dst_col] = src[src_idx]; - } - } - } - } - layer.wv_b = ggml_new_tensor_2d( - ctx, - wkv_b->type, - n_head_kv * n_embd_head_v, // 行数:合并头和特征维度 - kv_lora_rank // 列数:LoRA 秩 - ); - LLAMA_LOG_INFO("wv_b shape: [%d, %d]\n", layer.wv_b->ne[0], layer.wv_b->ne[1]); - { - float *src = (float *)wkv_b->data; - float *dst = (float *)layer.wv_b->data; - int src_stride = wkv_b->ne[0]; // 原始张量每行的元素数 - - for (int h = 0; h < n_head_kv; ++h) { - int v_start = h * (n_embd_head_qk_nope + n_embd_head_v) + n_embd_head_qk_nope; - for (int row = 0; row < kv_lora_rank; ++row) { - for (int col = 0; col < n_embd_head_v; ++col) { - LLAMA_LOG_INFO("wv_b row: %d, col: %d\n", row, col); - // 源索引计算 - int src_idx = row * src_stride + v_start + col; - LLAMA_LOG_INFO("src_idx: %d\n", src_idx); - GGML_ASSERT(src_idx < ggml_nelements(wkv_b)); - - // 目标索引计算 - int dst_row = h * n_embd_head_v + col; // 合并头和特征维度 - int dst_col = row; // LoRA 秩维度 - LLAMA_LOG_INFO("wv_b dst_row: %d, dst_col: %d\n", dst_row, dst_col); - dst[dst_row * kv_lora_rank + dst_col] = src[src_idx]; - } - } - } - } + auto trans_wkv_b = ggml_transpose(ctx, layer.wkv_b); + layer.wk_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_qk_nope, n_head, 0); + layer.wv_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); From 83fb5b895b4f2b8a96f726f89fc585a1e64e6dc8 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:06:59 +0800 Subject: [PATCH 027/100] tmp --- src/llama-kv-cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 4d43c692ea4bd..1905ad9273019 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent; // not supported due to MLA + cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = 
'%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); From 7dba6fb0dc76d918b721ae911ce03e9001e4e15c Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:07:54 +0800 Subject: [PATCH 028/100] remove log --- src/llama-model.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 51c56d144f645..25a51047c28fe 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2946,8 +2946,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { buft = ggml_backend_dev_buffer_type(cpu_dev); } - LLAMA_LOG_INFO("wkv_b shape: [%d, %d], type: %d\n", layer.wkv_b->ne[0], layer.wkv_b->ne[1], int(layer.wkv_b->type)); - LLAMA_LOG_INFO("n_head_kv: %d, kv_lora_rank: %d, n_embd_head_qk_nope: %d\n", n_head_kv, kv_lora_rank, n_embd_head_qk_nope); ggml_context * ctx = ctx_for_buft(buft); auto trans_wkv_b = ggml_transpose(ctx, layer.wkv_b); From fd32a4391d2bc0a620f17ef34d34ead4c2b3e387 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:18:31 +0800 Subject: [PATCH 029/100] add lock-free hash_map --- examples/server/atomic_hash_map.hpp | 596 ++++++++++++++++++++++++++++ examples/server/server.cpp | 85 ++-- 2 files changed, 635 insertions(+), 46 deletions(-) create mode 100644 examples/server/atomic_hash_map.hpp diff --git a/examples/server/atomic_hash_map.hpp b/examples/server/atomic_hash_map.hpp new file mode 100644 index 0000000000000..be6d78e87aca8 --- /dev/null +++ b/examples/server/atomic_hash_map.hpp @@ -0,0 +1,596 @@ +/* + hash_map -- Lock-Free Hash Map port from folly::AtomicUnorderedInsertMap for C++. + + Copyright (c) 2010-2017 + + This library is released under the MIT License. + Please see LICENSE file or visit https://github.com/ez8-co/atomic for details. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER + #include + #define LIKELY(x) (x) + #define UNLIKELY(x) (x) +#else + #define LIKELY(x) (__builtin_expect((x), 1)) + #define UNLIKELY(x) (__builtin_expect((x), 0)) +#endif + +#if __cplusplus >= 201103L || _MSC_VER >= 1700 + #include +#else +namespace std { + + typedef enum memory_order { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst + } memory_order; + +#ifdef _MSC_VER + template + struct interlocked {}; + + template + struct interlocked { + static inline T incre(T volatile* x) { + return static_cast(_InterlockedIncrement(reinterpret_cast(x))); + } + static inline T decre(T volatile* x) { + return static_cast(_InterlockedDecrement(reinterpret_cast(x))); + } + static inline T add(T volatile* x, T delta) { + return static_cast(_InterlockedExchangeAdd(reinterpret_cast(x), delta)); + } + static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) { + return static_cast( + _InterlockedCompareExchange(reinterpret_cast(x), + static_cast(new_val), static_cast(expected_val))); + } + static inline T exchange(T volatile* x, const T new_val) { + return static_cast( + _InterlockedExchange( + reinterpret_cast(x), static_cast(new_val))); + } + }; + + template + struct interlocked { + static inline T incre(T volatile* x) { +#ifdef WIN64 + return static_cast(_InterlockedIncrement64(reinterpret_cast(x))); +#else + return add(x, 1); +#endif // WIN64 + } + static inline T decre(T volatile* x) { +#ifdef WIN64 + return static_cast(_InterlockedDecrement64(reinterpret_cast(x))); +#else + return add(x, -1); +#endif // WIN64 + } + static inline T add(T volatile* x, T delta) { +#ifdef WIN64 + return static_cast(_InterlockedExchangeAdd64(reinterpret_cast(x), delta)); +#else + __int64 old_val, new_val; + do { + old_val = static_cast<__int64>(*x); + new_val = old_val + static_cast<__int64>(delta); + } while (_InterlockedCompareExchange64( + reinterpret_cast(x), new_val, old_val) != + old_val); + return static_cast(new_val); +#endif // WIN64 + } + static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) { + return static_cast( + _InterlockedCompareExchange64(reinterpret_cast(x), + static_cast(new_val), static_cast(expected_val))); + } + static inline T exchange(T volatile* x, const T new_val) { +#ifdef WIN64 + return static_cast( + _InterlockedExchange64(reinterpret_cast(x), + static_cast(new_val))); +#else + __int64 old_val; + do { + old_val = static_cast<__int64>(*x); + } while (_InterlockedCompareExchange64( + reinterpret_cast(x), new_val, old_val) != + old_val); + return static_cast(old_val); +#endif // WIN64 + } + }; + +#else + + template + struct hash {}; + + template<> + struct hash { + inline size_t operator()(size_t v) const { return v; } + }; + +#endif + + template + class atomic { + public: + atomic() : value_(static_cast(0)) {} + explicit atomic(const T value) : value_(value) {} + + T operator++() { + #ifdef _MSC_VER + return interlocked::incre(&value_); + #else + return __atomic_add_fetch(&value_, 1, __ATOMIC_SEQ_CST); + #endif + } + + T operator++(int) { + T v = load(); ++(*this); return v; + } + + T operator--() { + #ifdef _MSC_VER + return interlocked::decre(&value_); + #else + return __atomic_sub_fetch(&value_, 1, __ATOMIC_SEQ_CST); + #endif + } + + T operator+=(T v) { + #ifdef _MSC_VER + return interlocked::add(&value_, v); + #else + return 
__atomic_add_fetch(&value_, v, __ATOMIC_SEQ_CST); + #endif + } + + bool compare_exchange_strong(T& expected_val, T new_val, memory_order order = memory_order_seq_cst) { + #ifdef _MSC_VER + return expected_val == interlocked::compare_exchange(&value_, new_val, expected_val); + #else + return __atomic_compare_exchange_n(&value_, &expected_val, new_val, 0, order, __ATOMIC_SEQ_CST); + #endif + } + + void store(const T new_val, memory_order order = memory_order_seq_cst) { + #ifdef _MSC_VER + interlocked::exchange(&value_, new_val); + #else + __atomic_store_n(&value_, new_val, order); + #endif + } + + T load(memory_order order = memory_order_seq_cst) const { + #ifdef _MSC_VER + return interlocked::add(const_cast(&value_), 0); + #else + return __atomic_load_n(&value_, order); + #endif + } + + T operator=(const T new_value) { + store(new_value); + return new_value; + } + + operator T() const { + return load(); + } + + private: + volatile T value_; + }; +} +#endif + +/* +* Copyright 2013-present Facebook, Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +namespace atomic { + + size_t nextPowTwo(size_t v) { + #ifdef _MSC_VER + unsigned long x = 0; + _BitScanForward(&x, v - 1); + #else + int x = __builtin_clzll(v - 1); + #endif + return v ? (size_t(1) << (v - 1 ? (((sizeof(unsigned long long) << 3) - 1) ^ x) + 1 : 0)) : 1; + } + + template < + typename Key, + typename Value, + typename Hash = std::hash, + typename KeyEqual = std::equal_to, + template class Atom = std::atomic, + typename IndexType = size_t, + typename Allocator = std::allocator > + + struct hash_map { + + typedef Key key_type; + typedef Value mapped_type; + typedef std::pair value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef Hash hasher; + typedef KeyEqual key_equal; + typedef const value_type& const_reference; + + typedef struct ConstIterator : public std::iterator { + ConstIterator(const hash_map& owner, IndexType slot) + : owner_(owner) + , slot_(slot) + {} + + const value_type& operator*() const { + return owner_.slots_[slot_].keyValue(); + } + + const value_type* operator->() const { + return &owner_.slots_[slot_].keyValue(); + } + + // pre-increment + const ConstIterator& operator++() { + while (slot_ > 0) { + --slot_; + if (owner_.slots_[slot_].state() == LINKED) { + break; + } + } + return *this; + } + + // post-increment + ConstIterator operator++(int /* dummy */) { + ConstIterator prev = *this; + ++*this; + return prev; + } + + bool operator==(const ConstIterator& rhs) const { + return slot_ == rhs.slot_; + } + bool operator!=(const ConstIterator& rhs) const { + return !(*this == rhs); + } + + private: + const hash_map& owner_; + IndexType slot_; + } const_iterator; + + friend ConstIterator; + + explicit hash_map(size_t maxSize, + float maxLoadFactor = 0.8f, + const Allocator& alloc = Allocator()) + : allocator_(alloc) + { + size_t capacity = size_t(maxSize / (maxLoadFactor > 1.0f ? 
1.0f : maxLoadFactor) + 128); + size_t avail = size_t(1) << (8 * sizeof(IndexType) - 2); + if (capacity > avail && maxSize < avail) { + // we'll do our best + capacity = avail; + } + if (capacity < maxSize || capacity > avail) { + throw std::invalid_argument( + "hash_map capacity must fit in IndexType with 2 bits " + "left over"); + } + + numSlots_ = capacity; + slotMask_ = nextPowTwo(capacity * 4) - 1; + mmapRequested_ = sizeof(Slot) * capacity; + slots_ = reinterpret_cast(allocator_.allocate(mmapRequested_)); + memset(slots_, 0, mmapRequested_); + // mark the zero-th slot as in-use but not valid, since that happens + // to be our nil value + slots_[0].stateUpdate(EMPTY, CONSTRUCTING); + } + + ~hash_map() { + for (size_t i = 1; i < numSlots_; ++i) { + slots_[i].~Slot(); + } + allocator_.deallocate(reinterpret_cast(slots_), mmapRequested_); + } + + template + std::pair findOrConstruct(const Key& key, Func func, const V* value) { + IndexType const slot = keyToSlotIdx(key); + IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire); + + IndexType existing = find(key, slot); + if (existing) + return std::make_pair(ConstIterator(*this, existing), false); + + IndexType idx = allocateNear(slot); + // allocaion failed, return fake element + if (!idx) + return std::make_pair(ConstIterator(*this, idx), false); + new (&slots_[idx].keyValue().first) Key(key); + func(static_cast(&slots_[idx].keyValue().second), value); + + while (true) { + slots_[idx].next_ = prev >> 2; + + // we can merge the head update and the CONSTRUCTING -> LINKED update + // into a single CAS if slot == idx (which should happen often) + IndexType after = idx << 2; + if (slot == idx) + after += LINKED; + else + after += (prev & 3); + + if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) { + // success + if (idx != slot) + slots_[idx].stateUpdate(CONSTRUCTING, LINKED); + return std::make_pair(ConstIterator(*this, idx), true); + } + // compare_exchange_strong updates its first arg on failure, so + // there is no need to reread prev + + existing = find(key, slot); + if (existing) { + // our allocated key and value are no longer needed + slots_[idx].keyValue().first.~Key(); + slots_[idx].keyValue().second.~Value(); + slots_[idx].stateUpdate(CONSTRUCTING, EMPTY); + + return std::make_pair(ConstIterator(*this, existing), false); + } + } + } + + template + std::pair insert(const K& key, const V& value) { + return findOrConstruct(key, &hash_map::copyCtor, &value); + } + + const_iterator find(const Key& key) const { + return ConstIterator(*this, find(key, keyToSlotIdx(key))); + } + + const_iterator cbegin() const { + IndexType slot = numSlots_ - 1; + while (slot > 0 && slots_[slot].state() != LINKED) { + --slot; + } + return ConstIterator(*this, slot); + } + + const_iterator cend() const { + return ConstIterator(*this, 0); + } + + // Add by orca.zhang@yahoo.com + void clear() { + for (size_t i = 1; i < numSlots_; ++i) { + slots_[i].~Slot(); + } + memset(slots_, 0, mmapRequested_); + slots_[0].stateUpdate(EMPTY, CONSTRUCTING); + } + + // Add by orca.zhang@yahoo.com + bool erase(const Key& key) const { + KeyEqual ke; + IndexType slot = keyToSlotIdx(key); + IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire); + IndexType last_slot = 0; + for (IndexType idx = hs >> 2; idx != 0; idx = slots_[idx].next_) { + if (ke(key, slots_[idx].keyValue().first)) { + if (!last_slot) + slots_[slot].headAndState_ = (slots_[idx].next_ & (unsigned)-4) | (hs & 3); + else + slots_[last_slot].next_ = 
slots_[idx].next_; + slots_[idx].~Slot(); + slots_[idx].stateUpdate(LINKED, EMPTY); + return true; + } + last_slot = idx; + } + return false; + } + + private: + enum { + kMaxAllocationTries = 1000, // after this we throw + }; + + typedef IndexType BucketState; + + enum { + EMPTY = 0, + CONSTRUCTING = 1, + LINKED = 2, + }; + + /// Lock-free insertion is easiest by prepending to collision chains. + /// A large chaining hash table takes two cache misses instead of + /// one, however. Our solution is to colocate the bucket storage and + /// the head storage, so that even though we are traversing chains we + /// are likely to stay within the same cache line. Just make sure to + /// traverse head before looking at any keys. This strategy gives us + /// 32 bit pointers and fast iteration. + struct Slot { + /// The bottom two bits are the BucketState, the rest is the index + /// of the first bucket for the chain whose keys map to this slot. + /// When things are going well the head usually links to this slot, + /// but that doesn't always have to happen. + Atom headAndState_; + + /// The next bucket in the chain + IndexType next_; + + /// Key and Value + unsigned char raw_[sizeof(value_type)]; + + ~Slot() { + BucketState s = state(); + assert(s == EMPTY || s == LINKED); + if (s == LINKED) { + keyValue().first.~Key(); + keyValue().second.~Value(); + } + } + + BucketState state() const { + return BucketState(headAndState_.load(std::memory_order_acquire) & 3); + } + + void stateUpdate(BucketState before, BucketState after) { + assert(state() == before); + headAndState_ += (after - before); + } + + value_type& keyValue() { + assert(state() != EMPTY); + union { + unsigned char* p; + value_type* v; + } u; + u.p = raw_; + return *u.v; + } + + const value_type& keyValue() const { + assert(state() != EMPTY); + union { + unsigned char* p; + value_type* v; + } u; + u.p = raw_; + return *u.v; + } + + }; + + // We manually manage the slot memory so we can bypass initialization + // (by getting a zero-filled mmap chunk) and optionally destruction of + // the slots + + size_t mmapRequested_; + size_t numSlots_; + + /// tricky, see keyToSlodIdx + size_t slotMask_; + + Allocator allocator_; + Slot* slots_; + + IndexType keyToSlotIdx(const Key& key) const { + size_t h = hasher()(key); + h &= slotMask_; + while (h >= numSlots_) { + h -= numSlots_; + } + return h; + } + + IndexType find(const Key& key, IndexType slot) const { + KeyEqual ke; + IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire); + for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) { + if (ke(key, slots_[slot].keyValue().first)) { + return slot; + } + } + return 0; + } + + /// Allocates a slot and returns its index. Tries to put it near + /// slots_[start]. + IndexType allocateNear(IndexType start) { + for (IndexType tries = 0; tries < kMaxAllocationTries; ++tries) { + IndexType slot = allocationAttempt(start, tries); + IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire); + if ((prev & 3) == EMPTY && + slots_[slot].headAndState_.compare_exchange_strong( + prev, prev + CONSTRUCTING - EMPTY)) { + return slot; + } + } + return 0; // return fake element rather than throw exception to ignore overflow + // throw std::bad_alloc(); + } + + /// Returns the slot we should attempt to allocate after tries failed + /// tries, starting from the specified slot. 
This is pulled out so we + /// can specialize it differently during deterministic testing + IndexType allocationAttempt(IndexType start, IndexType tries) const { + if (LIKELY(tries < 8 && start + tries < numSlots_)) { + return IndexType(start + tries); + } else { + IndexType rv; + if (sizeof(IndexType) <= 4) { + rv = IndexType(rand() % numSlots_); + } else { + rv = IndexType(((int64_t(rand()) << 32) + rand()) % numSlots_); + } + assert(rv < numSlots_); + return rv; + } + } + + template + static void copyCtor(void* raw, const V* v) { + assert(v); + new (raw) Value(*v); + } + }; + + /// MutableAtom is a tiny wrapper than gives you the option of atomically + /// updating values inserted into an hash_map>. This relies on hash_map's guarantee + /// that it doesn't move values. + template class Atom = std::atomic> + struct MutableAtom { + mutable Atom data; + explicit MutableAtom(const T& init) : data(init) {} + }; + + /// MutableData is a tiny wrapper than gives you the option of using an + /// external concurrency control mechanism to updating values inserted + /// into an hash_map. + template + struct MutableData { + mutable T data; + explicit MutableData(const T& init) : data(init) {} + }; + +} // namespace atomic \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c4db6642e9ef6..d4c4cfd3b7dfe 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -18,6 +18,8 @@ #include "index.html.gz.hpp" #include "loading.html.hpp" +#include "atomic_hash_map.hpp" + #include #include #include @@ -1654,10 +1656,10 @@ struct server_queue { struct server_response { // for keeping track of all tasks waiting for the result - std::unordered_set waiting_task_ids; + atomic::hash_map waiting_task_ids; // the main result queue (using ptr for polymorphism) - std::vector queue_results; + atomic::hash_map queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -1665,41 +1667,26 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.insert(id_task); + waiting_task_ids.insert(id_task, 0); } void add_waiting_tasks(const std::vector & tasks) { for (const auto & task : tasks) { - std::unique_lock lock(mutex_results); SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); - waiting_task_ids.insert(task.id); + waiting_task_ids.insert(task.id, 0); } } // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - - { - std::unique_lock lock(mutex_results); - waiting_task_ids.erase(id_task); - } + waiting_task_ids.erase(id_task); // make sure to clean up all pending results - { - std::unique_lock lock(mutex_results); - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); - } + queue_results.erase(id_task); } void remove_waiting_task_ids(const std::unordered_set & id_tasks) { for (const auto & id_task : id_tasks) { - std::unique_lock lock(mutex_results); SRV_DBG("remove task %d from waiting list. 
current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); waiting_task_ids.erase(id_task); } @@ -1708,18 +1695,18 @@ struct server_response { // This function blocks the thread until there is a response for one of the id_tasks server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); - - for (size_t i = 0; i < queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); + for (size_t i = 0; i < id_tasks.size(); i++) { + auto iter = queue_results.find(id_tasks[i]); + if (iter != queue_results.cend()) { + server_task_result_ptr res = std::move(iter->second); + queue_results.erase(id_tasks[i]); return res; } } + + condition_results.wait(mutex_results, [&]{ + return !queue_results.empty(); + }); } // should never reach here @@ -1729,12 +1716,11 @@ struct server_response { // if timeout is reached, nullptr is returned server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { while (true) { - std::unique_lock lock(mutex_results); - - for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); + for (size_t i = 0; i < id_tasks.size(); i++) { + auto iter = queue_results.find(id_tasks[i]); + if (iter != queue_results.cend()) { + server_task_result_ptr res = std::move(iter->second); + queue_results.erase(id_tasks[i]); return res; } } @@ -1750,23 +1736,30 @@ struct server_response { // single-task version of recv() server_task_result_ptr recv(int id_task) { - std::unordered_set id_tasks = {id_task}; - return recv(id_tasks); + while (true) { + auto iter = queue_results.find(id_task); + if (iter != queue_results.cend()) { + server_task_result_ptr res = std::move(iter->second); + queue_results.erase(id_task); + return res; + } + + condition_results.wait(mutex_results, [&]{ + return !queue_results.empty(); + }); + } } // Send a new result to a waiting id_task void send(server_task_result_ptr && result) { SRV_DBG("sending result for task id = %d\n", result->id); - std::unique_lock lock(mutex_results); - for (const auto & id_task : waiting_task_ids) { - if (result->id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result->id); + if (waiting_task_ids.find(result->id) != waiting_task_ids.cend()) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.emplace_back(std::move(result)); - condition_results.notify_all(); - return; - } + queue_results.insert(result->id, std::move(result)); + condition_results.notify_all(); + return; } } }; From b092a2c632fec97593347a4953dc45e3588020c1 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:22:00 +0800 Subject: [PATCH 030/100] tmp --- src/llama-model.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 25a51047c28fe..065c6ea0927ff 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2949,8 +2949,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_context * ctx = ctx_for_buft(buft); auto trans_wkv_b = ggml_transpose(ctx, layer.wkv_b); - layer.wk_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_qk_nope, 
n_head, 0); - layer.wv_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); + auto wkv_b_copied = ggml_dup(ctx, trans_wkv_b); + layer.wk_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_qk_nope, n_head, 0); + layer.wv_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); From 2ffbc62e29f4c065886b8507fe98040186bccbf3 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:24:12 +0800 Subject: [PATCH 031/100] fix --- examples/server/server.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d4c4cfd3b7dfe..c216bb842474c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1695,11 +1695,11 @@ struct server_response { // This function blocks the thread until there is a response for one of the id_tasks server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { - for (size_t i = 0; i < id_tasks.size(); i++) { - auto iter = queue_results.find(id_tasks[i]); + for (const auto & id_task : id_tasks) { + auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { server_task_result_ptr res = std::move(iter->second); - queue_results.erase(id_tasks[i]); + queue_results.erase(id_task); return res; } } @@ -1716,11 +1716,11 @@ struct server_response { // if timeout is reached, nullptr is returned server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { while (true) { - for (size_t i = 0; i < id_tasks.size(); i++) { - auto iter = queue_results.find(id_tasks[i]); + for (const auto & id_task : id_tasks) { + auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { server_task_result_ptr res = std::move(iter->second); - queue_results.erase(id_tasks[i]); + queue_results.erase(id_task); return res; } } From 4c24c26d54e00f7db6c35d095419d81e30f35862 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:24:51 +0800 Subject: [PATCH 032/100] fix --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c216bb842474c..e6e15f537ecde 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1744,7 +1744,8 @@ struct server_response { return res; } - condition_results.wait(mutex_results, [&]{ + std::lock_guard lock(mutex_results); + condition_results.wait(lock, [&]{ return !queue_results.empty(); }); } From 21a6c9288aff9d28ed4a6eb2ece3ccb9a97579bd Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:26:47 +0800 Subject: [PATCH 033/100] fix --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e6e15f537ecde..a0617ea17d21d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1705,7 +1705,7 @@ struct server_response { } condition_results.wait(mutex_results, [&]{ - return !queue_results.empty(); + return queue_results.cbegin() != queue_results.cend(); }); } @@ -1746,7 +1746,7 @@ struct server_response { std::lock_guard lock(mutex_results); condition_results.wait(lock, [&]{ - return !queue_results.empty(); + return queue_results.cbegin() != queue_results.cend(); }); } } From 
45e3f2e14dc7c73d2160844c0abdf629819e2a75 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:29:22 +0800 Subject: [PATCH 034/100] fix --- examples/server/server.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a0617ea17d21d..7d548ccfc75ed 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1666,20 +1666,20 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); + SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1); waiting_task_ids.insert(id_task, 0); } void add_waiting_tasks(const std::vector & tasks) { for (const auto & task : tasks) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); + SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", task.id, queue_results.cbegin() == queue_results.cend() ? 0 : 1); waiting_task_ids.insert(task.id, 0); } } // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1); waiting_task_ids.erase(id_task); // make sure to clean up all pending results queue_results.erase(id_task); @@ -1687,7 +1687,7 @@ struct server_response { void remove_waiting_task_ids(const std::unordered_set & id_tasks) { for (const auto & id_task : id_tasks) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 
0 : 1); waiting_task_ids.erase(id_task); } } @@ -1704,7 +1704,8 @@ struct server_response { } } - condition_results.wait(mutex_results, [&]{ + std::lock_guard lock(mutex_results); + condition_results.wait(lock, [&]{ return queue_results.cbegin() != queue_results.cend(); }); } @@ -1733,7 +1734,7 @@ struct server_response { // should never reach here } - + // single-task version of recv() server_task_result_ptr recv(int id_task) { while (true) { From 8b3be100ee878984ad719bdaa7ed0305e095d699 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:30:40 +0800 Subject: [PATCH 035/100] fix --- examples/server/server.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7d548ccfc75ed..dcdea582d7723 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1698,13 +1698,13 @@ struct server_response { for (const auto & id_task : id_tasks) { auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { - server_task_result_ptr res = std::move(iter->second); + server_task_result_ptr res = iter->second; queue_results.erase(id_task); return res; } } - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ return queue_results.cbegin() != queue_results.cend(); }); @@ -1720,7 +1720,7 @@ struct server_response { for (const auto & id_task : id_tasks) { auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { - server_task_result_ptr res = std::move(iter->second); + server_task_result_ptr res = iter->second; queue_results.erase(id_task); return res; } @@ -1740,12 +1740,12 @@ struct server_response { while (true) { auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { - server_task_result_ptr res = std::move(iter->second); + server_task_result_ptr res = iter->second; queue_results.erase(id_task); return res; } - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ return queue_results.cbegin() != queue_results.cend(); }); From eb8e05843ccae607a6ca9abe6cb2e950f0388939 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:31:36 +0800 Subject: [PATCH 036/100] fix --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index dcdea582d7723..72b62daf49b5a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -516,7 +516,7 @@ struct server_task_result { }; // using shared_ptr for polymorphism of server_task_result -using server_task_result_ptr = std::unique_ptr; +using server_task_result_ptr = std::shared_ptr; inline std::string stop_type_to_str(stop_type type) { switch (type) { From a0317cddd15a807927e8ff91ccb3a50e3c23ce1c Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:36:01 +0800 Subject: [PATCH 037/100] fix --- examples/server/server.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 72b62daf49b5a..4c7212759de37 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1656,10 +1656,10 @@ struct server_queue { struct server_response { // for keeping track of all tasks waiting for the result - atomic::hash_map waiting_task_ids; + atomic::hash_map waiting_task_ids = {10000}; // the main result queue (using ptr for polymorphism) - atomic::hash_map queue_results; + 
atomic::hash_map queue_results = {10000}; std::mutex mutex_results; std::condition_variable condition_results; @@ -1726,6 +1726,7 @@ struct server_response { } } + std::unique_lock lock(mutex_results); std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); if (cr_res == std::cv_status::timeout) { return nullptr; From a15e0105f732b9f0ae9cd40f919c6fb15f9b803c Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:38:15 +0800 Subject: [PATCH 038/100] fix --- examples/server/atomic_hash_map.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/atomic_hash_map.hpp b/examples/server/atomic_hash_map.hpp index be6d78e87aca8..7f2a6d523f99f 100644 --- a/examples/server/atomic_hash_map.hpp +++ b/examples/server/atomic_hash_map.hpp @@ -294,7 +294,7 @@ namespace atomic { friend ConstIterator; - explicit hash_map(size_t maxSize, + hash_map(size_t maxSize, float maxLoadFactor = 0.8f, const Allocator& alloc = Allocator()) : allocator_(alloc) From 70fb2f972cf0c95734c75b907b0640cffc9358c9 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:43:02 +0800 Subject: [PATCH 039/100] fix --- examples/server/atomic_hash_map.hpp | 4 ++-- src/llama-model.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/server/atomic_hash_map.hpp b/examples/server/atomic_hash_map.hpp index 7f2a6d523f99f..f511b2159742a 100644 --- a/examples/server/atomic_hash_map.hpp +++ b/examples/server/atomic_hash_map.hpp @@ -295,8 +295,8 @@ namespace atomic { friend ConstIterator; hash_map(size_t maxSize, - float maxLoadFactor = 0.8f, - const Allocator& alloc = Allocator()) + float maxLoadFactor = 0.8f, + const Allocator& alloc = Allocator()) : allocator_(alloc) { size_t capacity = size_t(maxSize / (maxLoadFactor > 1.0f ? 
1.0f : maxLoadFactor) + 128); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 065c6ea0927ff..29de5dc1fb503 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2949,7 +2949,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_context * ctx = ctx_for_buft(buft); auto trans_wkv_b = ggml_transpose(ctx, layer.wkv_b); - auto wkv_b_copied = ggml_dup(ctx, trans_wkv_b); + auto wkv_b_copied = ggml_new_tensor_2d(ctx, trans_wkv_b->type, trans_wkv_b->ne[0], trans_wkv_b->ne[1]); + ggml_cpy(ctx, trans_wkv_b, wkv_b_copied); layer.wk_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_qk_nope, n_head, 0); layer.wv_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); } From ab9a13a9b7adb121f547812d95061d65856e0857 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 11:47:52 +0800 Subject: [PATCH 040/100] fix --- src/llama-model.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 29de5dc1fb503..f393fd9af5764 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2948,11 +2948,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_context * ctx = ctx_for_buft(buft); - auto trans_wkv_b = ggml_transpose(ctx, layer.wkv_b); - auto wkv_b_copied = ggml_new_tensor_2d(ctx, trans_wkv_b->type, trans_wkv_b->ne[0], trans_wkv_b->ne[1]); - ggml_cpy(ctx, trans_wkv_b, wkv_b_copied); - layer.wk_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_qk_nope, n_head, 0); - layer.wv_b = ggml_view_2d(ctx, wkv_b_copied, wkv_b_copied->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); + auto trans_wkv_b = ggml_cont(ctx, ggml_transpose(ctx, layer.wkv_b)); + layer.wk_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_qk_nope, n_head, 0); + layer.wv_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); From 751ff03669dbd5fb81c214762ca1bd6d2a205f33 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 12:21:39 +0800 Subject: [PATCH 041/100] fix --- src/llama-model.cpp | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f393fd9af5764..9db14d46f019c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2948,9 +2948,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_context * ctx = ctx_for_buft(buft); - auto trans_wkv_b = ggml_cont(ctx, ggml_transpose(ctx, layer.wkv_b)); - layer.wk_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_qk_nope, n_head, 0); - layer.wv_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head); + // 反量化 wkv_b + const auto * qtype = ggml_get_type_traits(layer.wkv_b->type); + std::vector dequantized_wkv_b(layer.wkv_b->ne[0] * layer.wkv_b->ne[1]); + qtype->to_float(layer.wkv_b->data, dequantized_wkv_b.data(), layer.wkv_b->ne[0] * layer.wkv_b->ne[1]); + + // 创建 wk_b 和 wv_b 张量 + auto * wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank); + auto * wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v); + + // 分割 wkv_b 数据来生成 wk_b 和 wv_b + for (int h = 0; h < n_head; ++h) { + int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); + + 
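+            // note: at this point wkv_b has been dequantized to F32 and empty wk_b/wv_b tensors allocated;
+            //       the inner loop below splits it for the current head: the n_embd_head_qk_nope columns of
+            //       each latent row are copied into wk_b (one kv_lora_rank x n_embd_head_qk_nope block per
+            //       head) and the n_embd_head_v columns are copied into wv_b (one row per latent dimension,
+            //       with the v columns of all heads concatenated)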
for (int row = 0; row < kv_lora_rank; ++row) { + for (int col = 0; col < n_embd_head_qk_nope; ++col) { + // 填充 wk_b + int src_idx = row * layer.wkv_b->ne[0] + k_start + col; + GGML_ASSERT(src_idx < dequantized_wkv_b.size()); + int dst_row = h * kv_lora_rank + row; + int dst_col = col; + ((float*)wk_b->data)[dst_row * n_embd_head_qk_nope + dst_col] = dequantized_wkv_b[src_idx]; + } + + for (int col = 0; col < n_embd_head_v; ++col) { + // 填充 wv_b + int src_idx = row * layer.wkv_b->ne[0] + k_start + n_embd_head_qk_nope + col; + GGML_ASSERT(src_idx < dequantized_wkv_b.size()); + int dst_row = row; + int dst_col = h * n_embd_head_v + col; + ((float*)wv_b->data)[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx]; + } + } + } + + layer.wk_b = ggml_cast(ctx, wk_b, layer.wkv_b->type); + layer.wv_b = ggml_cast(ctx, wv_b, layer.wkv_b->type); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); From 46ac9f6208d3518c8089d642c6885576f2a7a767 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 14:01:08 +0800 Subject: [PATCH 042/100] revert MLA --- convert_hf_to_gguf.py | 3956 +--------------------------------------- src/llama-kv-cache.cpp | 21 +- src/llama-kv-cache.h | 8 - src/llama.cpp | 153 +- 4 files changed, 92 insertions(+), 4046 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c3fd04941053b..047612392c7a5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -947,695 +947,63 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) - -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): - model_arch = gguf.MODEL_ARCH.GPTNEOX - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, 
:].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): - model_arch = gguf.MODEL_ARCH.BLOOM - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - name = re.sub(r'transformer\.', '', name) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("MPTForCausalLM") -class MPTModel(Model): - model_arch = gguf.MODEL_ARCH.MPT - - def set_vocab(self): - try: - self._set_vocab_gpt2() - except Exception: - # Fallback for SEA-LION model - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_pad_token_id(3) - self.gguf_writer.add_eos_token_id(1) - self.gguf_writer.add_unk_token_id(0) - - def 
set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - else: - self.gguf_writer.add_max_alibi_bias(0.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") - else: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - - return [(new_name, data_torch)] - - -@Model.register("OrionForCausalLM") -class OrionModel(Model): - model_arch = gguf.MODEL_ARCH.ORION - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): - model_arch = gguf.MODEL_ARCH.BAICHUAN - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - 
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": - logger.info(f"Unpacking and permuting layer {bid}") - tensors = [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), - ] - else: - tensors = [(self.map_tensor_name(name), data_torch)] - - return tensors - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -@Model.register("XverseForCausalLM") -class XverseModel(Model): - model_arch = gguf.MODEL_ARCH.XVERSE - - def set_vocab(self): - assert (self.dir_model / "tokenizer.json").is_file() - dir_model = self.dir_model - hparams = self.hparams - - tokens: list[bytes] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, - # because vocab_size is the count of items, and indexes start at 0. 
- max_vocab_index = max(tokenizer.get_vocab().values()) - if max_vocab_index >= vocab_size: - raise ValueError("Vocabulary size exceeds expected maximum size.") - - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') - # replace "\x00" to string with length > 0 - if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special - elif reverse_vocab[token_id] in added_vocab: - if tokenizer.added_tokens_decoder[token_id].special: - toktype = gguf.TokenType.CONTROL - else: - toktype = gguf.TokenType.USER_DEFINED - else: - toktype = gguf.TokenType.NORMAL - - tokens.append(token_text) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith("q_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) - if name.endswith("k_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - - return [(self.map_tensor_name(name), data_torch)] - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - 
n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): - model_arch = gguf.MODEL_ARCH.FALCON - - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - head_dim = self.hparams["hidden_size"] // n_head - - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): - model_arch = gguf.MODEL_ARCH.STARCODER - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): - model_arch = gguf.MODEL_ARCH.REFACT - - def set_vocab(self): - super().set_vocab() - - # TODO: how to determine special FIM tokens automatically? 
- special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot']) - special_vocab._set_special_token("prefix", 1) - special_vocab._set_special_token("suffix", 3) - special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None: - if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) - elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) - elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - - if len(tensors) == 0: - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): - model_arch = gguf.MODEL_ARCH.STABLELM - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - 
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) - self.gguf_writer.add_file_type(self.ftype) - - _q_norms: list[dict[str, Tensor]] | None = None - _k_norms: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - - if name.find("q_layernorm.norms") != -1: - assert bid is not None - - if self._q_norms is None: - self._q_norms = [{} for _ in range(self.block_count)] - - self._q_norms[bid][name] = data_torch - - if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") - else: - return [] - - if name.find("k_layernorm.norms") != -1: - assert bid is not None - - if self._k_norms is None: - self._k_norms = [{} for _ in range(self.block_count)] - - self._k_norms[bid][name] = data_torch - - if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): - datas: list[Tensor] = [] - # extract the norms in order - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - datas.append(norms[ename]) - del norms[ename] - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - new_name = self.map_tensor_name(merged_name) - - return [(new_name, data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` - norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] - ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] - ) - if len(norms) > 0: - raise ValueError(f"Unprocessed norms: {norms}") - - -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): - model_arch = gguf.MODEL_ARCH.LLAMA +@Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" 
in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams + + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) + raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") + + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and 
int(match.group(1)) >= block_count: + return [] # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] assert bid is not None if self._experts is None: @@ -1647,17 +1015,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: + for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] data_torch = torch.stack(datas, dim=0) - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" new_name = self.map_tensor_name(merged_name) @@ -1666,36 +1034,29 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] - return [(self.map_tensor_name(name), data_torch)] + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) + v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) + + return [ + (self.map_tensor_name(name), data_torch), + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): super().prepare_tensors() @@ -1707,3199 +1068,6 @@ def prepare_tensors(self): 
raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): - model_arch = gguf.MODEL_ARCH.DECI - - @staticmethod - def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - # DeciLM-specific code - intermediate_size = int(2 * ffn_mult * n_embd / 3) - return DeciModel._find_multiple(intermediate_size, 256) - - @staticmethod - def _find_multiple(n: int, k: int) -> int: - # DeciLM-specific code - if n % k == 0: - return n - return n + k - (n % k) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] - assert self.block_count == len(_block_configs) - self._num_kv_heads = list() - self._num_heads = list() - _ffn_multipliers = list() - # ***linear attention layer*** - # if n_heads_in_group is None and replace_with_linear is True - # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads - # ***attention-free layer*** - # if n_heads_in_group is None and replace_with_linear is False - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 - # ***normal attention-layer*** - # if n_heads_in_group is not None, then - # _num_kv_heads[il] is num_attention_head // n_heads_in_group and - # _num_heads[il] is num_attention_head - for il in range(len(_block_configs)): - if _block_configs[il]["attention"]["n_heads_in_group"] is None: - if _block_configs[il]["attention"]["replace_with_linear"] is True: - self._num_kv_heads.append(0) - self._num_heads.append(self.hparams["num_attention_heads"]) - else: - self._num_kv_heads.append(0) - self._num_heads.append(0) - else: - self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) - self._num_heads.append(self.hparams["num_attention_heads"]) - _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(_ffn_multipliers) - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) - assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) - self._ffn_dims: list[int] = [ - DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) - for multiplier in _ffn_multipliers - ] - - def set_vocab(self): - # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's - # eos_token from '|eot_id|' to '|end_of_text|' - if self.hparams.get("vocab_size", 128256) == 128256: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - else: - # DeciLM-7B - self._set_vocab_llama_hf() - - def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - 
self.gguf_writer.add_head_count(self._num_heads) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B - super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B - self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] - assert self.block_count == len(self._num_kv_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - if bid is not None: - if "num_key_value_heads_per_layer" in self.hparams: - n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] - elif "block_configs" in self.hparams: - n_kv_head = self._num_kv_heads[bid] - n_head = self._num_heads[bid] - else: - n_kv_head = self.hparams.get("num_key_value_heads") - else: - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / 
freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - -@Model.register("BitnetForCausalLM") -class BitnetModel(Model): - model_arch = gguf.MODEL_ARCH.BITNET - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def weight_quant(self, weight: Tensor) -> Tensor: - dtype = weight.dtype - weight = weight.float() - scale = weight.abs().mean().clamp(min=1e-5) - iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 - result = (weight * iscale).round().clamp(-1, 1) / iscale - return result.type(dtype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if any(self.match_model_tensor_name(new_name, key, bid) for key in [ - gguf.MODEL_TENSOR.ATTN_Q, - gguf.MODEL_TENSOR.ATTN_K, - gguf.MODEL_TENSOR.ATTN_V, - gguf.MODEL_TENSOR.ATTN_OUT, - gguf.MODEL_TENSOR.FFN_UP, - gguf.MODEL_TENSOR.FFN_DOWN, - gguf.MODEL_TENSOR.FFN_GATE, - ]): - # transform weight into 1/0/-1 (in fp32) - data_torch = self.weight_quant(data_torch) - - yield (new_name, data_torch) - - -@Model.register("GrokForCausalLM") -class GrokModel(Model): - model_arch = gguf.MODEL_ARCH.GROK - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find(".moe.") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["linear", "linear_1", "linear_v"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): - model_arch = gguf.MODEL_ARCH.DBRX - - def set_gguf_parameters(self): - ffn_config = self.hparams["ffn_config"] - attn_config = self.hparams["attn_config"] - 
self.gguf_writer.add_block_count(self.hparams["n_layers"]) - - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) - - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) - - self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - - self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) - - self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - - self.gguf_writer.add_layer_norm_eps(1e-5) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_expert = self.hparams["ffn_config"]["moe_num_experts"] - n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] - n_embd = self.hparams["d_model"] - - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - experts = False - - for exp_tensor_name in exp_tensor_names.keys(): - if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) - if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: - data_torch = data_torch.permute(*permute_tensor) - break - - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
- # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - - return [(new_name, data_torch)] - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid # unused - - return n_dims > 1 - - -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): - model_arch = gguf.MODEL_ARCH.MINICPM - - def set_gguf_parameters(self): - super().set_gguf_parameters() - embedding_scale = float(self.hparams["scale_emb"]) - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - self.gguf_writer.add_residual_scale(residual_scale) - logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] - self.gguf_writer.add_logit_scale(logit_scale) - logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - if self.hparams.get("rope_scaling") is not None: - if self.hparams["rope_scaling"].get("type") == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): - model_arch = gguf.MODEL_ARCH.MINICPM3 - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - 
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - rope_dims = self.hparams["qk_rope_head_dim"] - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@Model.register("QWenLMHeadModel") -class QwenModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - 
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): - model_arch = gguf.MODEL_ARCH.QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - - -@Model.register("Qwen2VLForConditionalGeneration") -class Qwen2VLModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN2VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - mrope_section = self.hparams["rope_scaling"]["mrope_section"] - mrope_section += [0] * max(0, 4 - len(mrope_section)) - self.gguf_writer.add_rope_dimension_sections(mrope_section) - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, data in super().get_tensors(): - if name.startswith("visual."): - continue - yield name, data - - -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): - model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if \ - name.endswith("codebook.cluster_size") or \ - name.endswith("codebook.embed_avg") or \ - name.endswith("codebook.inited"): - logger.debug(f"Skipping {name!r}") - return [] - - logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") - - return [(self.map_tensor_name(name), data_torch)] - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) - self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) - self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) - self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) - - self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) - self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) - - self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) - self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) - - self.gguf_writer.add_causal_attention(False) - - -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN2MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("num_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (shared_expert_intermediate_size := 
self.hparams.get('shared_expert_intermediate_size')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) - logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams["num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): - model_arch = gguf.MODEL_ARCH.GPT2 - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias", ".attn.masked_bias")): - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - tensors.append((new_name, data_torch)) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("PhiForCausalLM") -class Phi2Model(Model): - model_arch = gguf.MODEL_ARCH.PHI2 - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - rot_pct = self.find_hparam(["partial_rotary_factor"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - 
self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): - model_arch = gguf.MODEL_ARCH.PHI3 - - def set_vocab(self): - # Phi-4 model uses GPT2Tokenizer - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer': - return self._set_vocab_gpt2() - - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", 
encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - rms_eps = self.find_hparam(["rms_norm_eps"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head - - self.gguf_writer.add_context_length(max_pos_embds) - self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) - self.gguf_writer.add_file_type(self.ftype) - sliding_window = self.hparams.get("sliding_window") - # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models - if sliding_window is None: - sliding_window = 0 - self.gguf_writer.add_sliding_window(sliding_window) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head - - # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: - return - - scale = max_pos_embds / orig_max_pos_embds - - rope_scaling_type = rope_scaling.get('type', '').lower() - if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') - - if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': - attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 - else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') - - self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - - 
long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - -@Model.register("PhiMoEForCausalLM") -class PhiMoeModel(Phi3MiniModel): - model_arch = gguf.MODEL_ARCH.PHIMOE - - _experts: list[dict[str, Tensor]] | None = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): - model_arch = gguf.MODEL_ARCH.PLAMO - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert 
data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - return [(new_name, data_torch)] - - -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): - model_arch = gguf.MODEL_ARCH.CODESHELL - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): - model_arch = gguf.MODEL_ARCH.INTERNLM2 - - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. 
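# A minimal sketch of the head regrouping done by PlamoModel.shuffle_attn_q_weight earlier in
# this hunk, using hypothetical small dimensions (2x3 heads of size 4) in place of the real
# 8x5 heads of size 128; the permute moves the row blocks into the order the converter wants
# for GQA broadcasting in ggml_mul_mat (per the comment above):
import torch

a, b, head_dim = 2, 3, 4                  # stand-ins for the real 8, 5, 128
n_embd = a * b * head_dim
w = torch.arange(n_embd * n_embd, dtype=torch.float32).reshape(n_embd, n_embd)
shuffled = w.reshape(a, b, head_dim, n_embd).permute(1, 0, 2, 3).reshape(n_embd, n_embd)
assert shuffled[:head_dim].equal(w[:head_dim])                                    # block (0, 0) stays put
assert shuffled[head_dim:2 * head_dim].equal(w[b * head_dim:(b + 1) * head_dim])  # block (1, 0) moves up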
- logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - # take care of ununsed raw token - if piece.startswith('[UNUSED'): - toktype = SentencePieceTokenTypes.UNUSED - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - chat_eos_token = '<|im_end|>' - chat_eos_token_id = None - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if chat_eos_token_id is not None: - # For the chat model, we replace the eos with '<|im_end|>'. 
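# A small sketch of what the eos override below amounts to (all ids hypothetical): the "eos"
# entry in the special-token metadata is repointed from the SentencePiece end-of-sequence token
# to '<|im_end|>', so chat-mode generation stops at the turn boundary instead of running on:
special_token_ids = {"bos": 1, "eos": 2}        # as resolved from tokenizer.model (hypothetical)
chat_eos_token_id = 92542                       # hypothetical id of '<|im_end|>' from added tokens
special_token_ids["eos"] = chat_eos_token_id    # this is what add_to_gguf() will then write out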
- # TODO: this is a hack, should be fixed - # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = chat_eos_token_id - logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" - " in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - n_embd = self.hparams["hidden_size"] - q_per_kv = num_heads // num_kv_heads - head_dim = n_embd // num_heads - num_groups = num_heads // q_per_kv - - if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: - qkv = data_torch - - qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) - q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] - - # The model weights of q and k equire additional reshape. 
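# A minimal shape check for the fused-wqkv split above, using hypothetical small sizes
# (8 query heads, 2 KV heads, head_dim 4) in place of the real InternLM2 dimensions:
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 4
n_embd = num_heads * head_dim                   # 32
q_per_kv = num_heads // num_kv_heads            # 4 query heads per KV head
num_groups = num_heads // q_per_kv              # 2 (== num_kv_heads)

qkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, n_embd)
qkv = qkv.reshape(num_groups, q_per_kv + 2, head_dim, n_embd)
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]
assert q.reshape(-1, n_embd).shape == (n_embd, n_embd)                    # full set of query heads
assert k.reshape(-1, n_embd).shape == (num_kv_heads * head_dim, n_embd)   # grouped KV heads
assert v.reshape(-1, n_embd).shape == (num_kv_heads * head_dim, n_embd)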
- q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) - v = v.reshape((-1, v.shape[-1])) - - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), - ] - else: - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - if "added_tokens_decoder" in tokenizer_config_json: - for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): - if token_data.get("special"): - token_id = int(token_id) - token = token_data["content"] - special_vocab._set_special_token(token, token_id) - # update eos token - if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: - special_vocab.special_token_ids["eos"] = token_id - - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vocab_size = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_causal_attention(False) - - # get pooling path - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in 
modules: - if mod["type"] == "sentence_transformers.models.Pooling": - pooling_path = mod["path"] - break - - # get pooling type - if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: - pooling = json.load(f) - if pooling["pooling_mode_mean_tokens"]: - pooling_type = gguf.PoolingType.MEAN - elif pooling["pooling_mode_cls_token"]: - pooling_type = gguf.PoolingType.CLS - else: - raise NotImplementedError("Only MEAN and CLS pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - # convert to phantom space vocab - def phantom(tok): - if tok.startswith("[") and tok.endswith("]"): - return tok - if tok.startswith("##"): - return tok[2:] - return "\u2581" + tok - tokens = list(map(phantom, tokens)) - - # add vocab to gguf - self.gguf_writer.add_tokenizer_model("bert") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # handle special tokens - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if name.startswith("bert."): - name = name[5:] - - if name.endswith(".gamma"): - name = name[:-6] + ".weight" - - if name.endswith(".beta"): - name = name[:-5] + ".bias" - - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these - - if name.startswith("cls.predictions"): - return [] - - if name.startswith("cls.seq_relationship"): - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - else: - return super().set_vocab() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = 
SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        # realign tokens (see HF tokenizer code)
-        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
-        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
-
-        self.gguf_writer.add_tokenizer_model("t5")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "roberta.", remove the prefix
-        # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): - model_arch = gguf.MODEL_ARCH.GEMMA - - def set_vocab(self): - self._set_vocab_sentencepiece() - - # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 67) - special_vocab._set_special_token("suffix", 69) - special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. 
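# The "+ 1" applied to *norm.weight a few lines below exists because HF Gemma's RMSNorm
# computes x_norm * (1 + weight), while the converted weight is expected to be applied as a
# plain multiply; a minimal numerical check of that equivalence (hypothetical shapes):
import torch

def hf_gemma_rmsnorm(x, w, eps=1e-6):
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x * (1.0 + w)                    # what modeling_gemma.py does

def plain_rmsnorm(x, w, eps=1e-6):
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x * w                            # how the stored GGUF weight is applied

x, w = torch.randn(4, 8), torch.randn(8)
assert torch.allclose(hf_gemma_rmsnorm(x, w), plain_rmsnorm(x, w + 1))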
- if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] - - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): - model_arch = gguf.MODEL_ARCH.GEMMA2 - - def set_vocab(self): - self._set_vocab_sentencepiece() - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_attn_logit_softcapping( - self.hparams["attn_logit_softcapping"] - ) - self.gguf_writer.add_final_logit_softcapping( - self.hparams["final_logit_softcapping"] - ) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. 
- if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] - - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): - model_arch = gguf.MODEL_ARCH.STARCODER2 - - -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): - model_arch = gguf.MODEL_ARCH.RWKV6 - - def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = [''.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_size = self.hparams["head_size"] - hidden_size = self.hparams["hidden_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - rescale_every_n_layers = self.hparams["rescale_every"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) - time_mix_extra_dim = 64 if hidden_size == 4096 else 32 - time_decay_extra_dim = 128 if hidden_size == 4096 else 64 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if 
new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): - data_torch = data_torch.transpose(0, 1) - - if new_name.endswith("time_mix_w2.weight"): - data_torch = data_torch.permute(0, 2, 1) - - if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: - data_torch = data_torch.squeeze() - - try: - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) - except KeyError: - pass - - # concat time_mix_lerp weights to reduce some cpu overhead - # also reduces the number of tensors in the model - if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: - try: - self.lerp_weights[bid][new_name] = data_torch - except KeyError: - self.lerp_weights[bid] = {new_name: data_torch} - if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) - yield (new_name, data) - return - - yield (new_name, data_torch) - - -@Model.register("RWKV6Qwen2ForCausalLM") -class RWKV6Qwen2Model(Rwkv6Model): - model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - num_attention_heads = self.hparams["num_attention_heads"] - num_key_value_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - head_size = hidden_size // num_attention_heads - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 - time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # special parameters for time_mixing in RWKV6QWEN2 - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_token_shift_count(1) - # RWKV6QWEN2 use grouped key/value like GQA - self.gguf_writer.add_head_count_kv(num_key_value_heads) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - for new_name, data in super().modify_tensors(data_torch, name, bid): - if "time_mix_w1" in new_name or "time_mix_w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg - # permute them here to avoid code changes - data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) - if "w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - yield (new_name, data) - 
continue - yield (new_name, data) - - -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): - model_arch = gguf.MODEL_ARCH.MAMBA - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - elif (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - use_dt_b_c_norm = False - # For falconmamba we do apply RMS norm on B / DT and C layers - if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): - use_dt_b_c_norm = True - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers - self.gguf_writer.add_file_type(self.ftype) - - _tok_embd = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - # assuming token_embd.weight is seen before output.weight - if self._tok_embd is not None and new_name == output_name: - if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return [] - elif new_name == tok_embd_name: - self._tok_embd = data_torch - - return [(new_name, data_torch)] - - -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): - model_arch = 
gguf.MODEL_ARCH.COMMAND_R - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): - model_arch = gguf.MODEL_ARCH.COHERE2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - rotary_pct = self.hparams["rotary_pct"] - hidden_size = self.hparams["hidden_size"] - num_attention_heads = self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): - model_arch = gguf.MODEL_ARCH.OLMO - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_eps(1e-5) - clip_qkv = self.hparams.get("clip_qkv") - if clip_qkv is not None: - self.gguf_writer.add_clamp_kqv(clip_qkv) - - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): - model_arch = gguf.MODEL_ARCH.OLMO2 - - -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): - model_arch = gguf.MODEL_ARCH.OLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_rms_eps(1e-5) - if (n_experts := self.hparams.get("num_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams["num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = 
f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("JinaBertModel", "JinaBertForMaskedLM") -class JinaBertV2Model(BertModel): - model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layer' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - name1 = name1.replace('up_gated_layer', 'gated_layers_v') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - name2 = name2.replace('up_gated_layer', 'gated_layers_w') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - - def set_vocab(self): - tokenizer_class = 'BertTokenizer' - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] - - if tokenizer_class == 'BertTokenizer': - super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() - self.gguf_writer.add_token_type_count(2) - else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - if name.startswith("bert."): - name = name[5:] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): - model_arch = gguf.MODEL_ARCH.OPENELM - - @staticmethod - def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 - new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
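As a quick numeric check of this rounding rule, here is a standalone copy of the helper with made-up widths (the divisor and values below are illustrative, not taken from a real OpenELM config):

```python
def make_divisible(v: float, divisor: int) -> int:
    # round to the nearest multiple of `divisor`, but never below `divisor` itself
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    # if rounding down lost more than 10% of the requested width, bump up one step
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

print(make_divisible(2.0 * 1280, 256))  # 2560: already a multiple, unchanged
print(make_divisible(300, 256))         # 512: nearest multiple (256) loses >10%, so it is bumped up
```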
- if new_v < 0.9 * v: - new_v += divisor - return new_v - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] - ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] - self._n_embd: int = self.hparams["model_dim"] - self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] - self._num_query_heads: list[int] = self.hparams["num_query_heads"] - self._ffn_dims: list[int] = [ - OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) - for multiplier in ffn_multipliers - ] - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - - # Uses the tokenizer from meta-llama/Llama-2-7b-hf - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) - - def set_gguf_parameters(self): - n_embd = self._n_embd - head_dim = self.hparams["head_dim"] - rot_pct = 1.0 - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_query_heads) - assert self.block_count == len(self._ffn_dims) - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_context_length"]) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_head_count(self._num_query_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 - self.gguf_writer.add_layer_norm_rms_eps(1e-6) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - self.gguf_writer.add_file_type(self.ftype) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - if "n_layers" in keys: - return self.hparams["num_transformer_layers"] - - return super().find_hparam(keys, optional) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # split ff - if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": - ff_dim = self._ffn_dims[bid] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) - return - - yield (self.map_tensor_name(name), data_torch) - - -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): - model_arch = gguf.MODEL_ARCH.ARCTIC - - def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. 
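The hand-rolled vocab builders in this file (Arctic here, the T5 models later) follow the same SentencePiece pattern: load tokenizer.model, record a text, a score and a token type for every id, then overlay any added or redefined tokens. A condensed, standalone sketch of the core loop (the path is illustrative, error handling is omitted, and plain strings stand in for the SentencePieceTokenTypes constants):

```python
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.LoadFromFile("tokenizer.model")  # illustrative path

tokens, scores, toktypes = [], [], []
for tid in range(sp.vocab_size()):
    tokens.append(sp.IdToPiece(tid).encode("utf-8"))
    scores.append(sp.GetScore(tid))
    if sp.IsUnknown(tid):
        toktypes.append("UNKNOWN")
    elif sp.IsControl(tid):
        toktypes.append("CONTROL")
    elif sp.IsUnused(tid):
        toktypes.append("UNUSED")
    elif sp.IsByte(tid):
        toktypes.append("BYTE")
    else:
        toktypes.append("NORMAL")
```

The converters then patch these three lists using added_tokens.json or the added_tokens_decoder section of tokenizer_config.json before writing them to GGUF.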
- from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - # Read the whole vocabulary from the tokenizer.model file - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - - if "added_tokens_decoder" in tokenizer_config_json: - added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] - for token_id, token_json in added_tokens_decoder.items(): - token_id = int(token_id) - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED - token_score = -10000.0 - - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model - if ("special" in token_json) and token_json["special"]: - if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN - else: - token_type = SentencePieceTokenTypes.CONTROL - token_score = 0.0 - - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") - tokens[token_id] = token_content.encode("utf-8") - toktypes[token_id] = token_type - scores[token_id] = token_score - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if 
name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= 
n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - if hparams["scoring_func"] == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif hparams["scoring_func"] == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") - - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # rename e_score_correction_bias tensors - if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") - - # skip Multi-Token Prediction (MTP) layers - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return [] - 
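Further down in DeepseekV2Model.modify_tensors, the fused kv_b_proj weight is decomposed into separate k_b and v_b tensors for the MLA attention path. A minimal sketch of that reshaping with toy dimensions (the sizes below are illustrative, not DeepSeek-V2's real ones):

```python
import torch

# toy dimensions, chosen only to make the shapes easy to follow
n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 4, 8, 6, 16

# fused kv_b_proj weight: latent (kv_lora_rank) -> per-head [K_nope; V]
kv_b_proj = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b = kv_b_proj.view(n_head_kv, qk_nope_head_dim + v_head_dim, kv_lora_rank)
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)

# k_b is stored transposed so the no-RoPE part of the query can be projected
# into the latent space at inference time
k_b = k_b.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim)
v_b = v_b.reshape(n_head_kv * v_head_dim, kv_lora_rank)

print(k_b.shape, v_b.shape)  # torch.Size([64, 8]) torch.Size([24, 16])
```

These two pieces become the attn_k_b and attn_v_b tensors consumed by the MLA graph in the llama.cpp changes elsewhere in this series.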
- # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.hparams["v_head_dim"] - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) - v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) - - return [ - (self.map_tensor_name(name), data_torch), - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): - model_arch = gguf.MODEL_ARCH.T5 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert 
sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - 
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): - model_arch = gguf.MODEL_ARCH.T5ENCODER - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - 
elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. 
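A toy illustration of the first-one-wins rule described in the comment above, written independently of the converter class (the tensor names are real T5 checkpoint names, the data is dummy):

```python
import torch

shared_names = ("decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight")
seen_shared = False
converted: dict[str, torch.Tensor] = {}

for name, tensor in [
    ("shared.weight",               torch.zeros(4, 2)),
    ("encoder.embed_tokens.weight", torch.zeros(4, 2)),
    ("decoder.embed_tokens.weight", torch.zeros(4, 2)),
    ("encoder.block.0.layer.0.SelfAttention.q.weight", torch.zeros(2, 2)),
]:
    if name in shared_names:
        if seen_shared:
            continue                      # later copies are dropped
        name, seen_shared = "shared.weight", True
    converted[name] = tensor

print(list(converted))
# ['shared.weight', 'encoder.block.0.layer.0.SelfAttention.q.weight']
```

Only one embedding tensor is written to the GGUF file, and it serves both the encoder and the decoder.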
- if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("JAISLMHeadModel") -class JaisModel(Model): - model_arch = gguf.MODEL_ARCH.JAIS - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding - assert self.hparams["position_embedding_type"] == "alibi" - - # Embeddings scale - self.embeddings_scale = 1.0 - if 'mup_embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] - else: - assert False - - self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] - else: - assert False - - self.max_alibi_bias = 8.0 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias")): - return tensors - - if name.endswith(("relative_pe.slopes")): - # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) - # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, - # but Jais's PyTorch model simply precalculates the slope values and places them - # in relative_pes.slopes - n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) - first_val = float(data_torch[0].item()) - self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((new_name, data_torch * self.embeddings_scale)) - elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - tensors.append((new_name, data_torch * self.width_scale)) - else: - tensors.append((new_name, data_torch)) - - return tensors - - def prepare_tensors(self): - super().prepare_tensors() - self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) - - -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): - model_arch = gguf.MODEL_ARCH.CHATGLM - - def set_vocab_chatglm3(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytes] = 
[] - toktypes: list[int] = [] - scores: list[float] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) - assert max(tokenizer.get_vocab().values()) < vocab_size - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens - for token_id in range(vocab_size): - piece = tokenizer._convert_id_to_token(token_id) - if token_id == 0: - piece = "" - elif token_id == 1: - piece = "" - elif token_id == 2: - piece = "" - - text = piece.encode("utf-8") - score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() - if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): - score = tokenizer.tokenizer.sp_model.get_score(token_id) - - if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): - if piece in special_tokens: - toktype = SentencePieceTokenTypes.CONTROL - elif len(piece) == 0: - text = f"[PAD{token_id}]".encode("utf-8") - toktype = SentencePieceTokenTypes.UNUSED - else: - toktype = SentencePieceTokenTypes.USER_DEFINED - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - continue - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.tokenizer.sp_model.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.tokenizer.sp_model.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.tokenizer.sp_model.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.tokenizer.sp_model.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - # glm3 needs prefix and suffix formatted as: - # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" - self.gguf_writer.add_tokenizer_pre("chatglm-spm") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): - self.set_vocab_chatglm3() - return - - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from 
transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) - assert max(tokenizer.get_vocab().values()) < vocab_size - - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) - self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"])) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) - self.gguf_writer.add_file_type(self.ftype) - if "attention_dim" in self.hparams: - rope_dim = self.hparams["attention_dim"] - else: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - self.gguf_writer.add_add_bos_token(False) - rope_freq = 10000 - if "rope_ratio" in self.hparams: - rope_freq = rope_freq * self.hparams["rope_ratio"] - self.gguf_writer.add_rope_freq_base(rope_freq) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): - return [] - - name = name.removeprefix("transformer.") - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): - model_arch = gguf.MODEL_ARCH.NEMOTRON - - def set_vocab(self): - self._set_vocab_sentencepiece() - self.gguf_writer.add_pad_token_id(0) - self.gguf_writer.add_unk_token_id(1) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - - # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - - # * RopeScaling for Nemotron - if 
"rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): - model_arch = gguf.MODEL_ARCH.EXAONE - - def set_gguf_parameters(self): - hparams = self.hparams - - assert (hparams["activation_function"] == "silu") - - max_position_embeddings = hparams["max_position_embeddings"] - embed_dim = hparams["hidden_size"] - num_heads = hparams["num_attention_heads"] - num_kv_heads = hparams.get("num_key_value_heads", num_heads) - layer_norm_eps = hparams["layer_norm_epsilon"] - intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim - num_layers = hparams["num_layers"] - # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 - # attention_dropout_rate = hparams["attention_dropout"] - # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 - # embed_dropout_rate = hparams["embed_dropout"] - self.gguf_writer.add_embedding_length(embed_dim) - self.gguf_writer.add_head_count(num_heads) - self.gguf_writer.add_head_count_kv(num_kv_heads) - self.gguf_writer.add_context_length(max_position_embeddings) - self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_block_count(num_layers) - self.gguf_writer.add_file_type(self.ftype) - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) - rotary_factor = rotary_factor if rotary_factor is not None else 1.0 - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: - if hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen 
!= high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@Model.register("GraniteForCausalLM") -class GraniteModel(LlamaModel): - """Conversion for IBM's GraniteForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE - - def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: - - - No head_dim support - - New multiplier params: - - attention_scale - - embedding_scale - - residual_scale - - logits_scaling - """ - if head_dim := self.hparams.pop("head_dim", None): - logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) - super().set_gguf_parameters() - # NOTE: Convert _multiplier params to _scale params for naming - # consistency - if attention_scale := self.hparams.get("attention_multiplier"): - self.gguf_writer.add_attention_scale(attention_scale) - logger.info("gguf: (granite) attention_scale = %s", attention_scale) - if embedding_scale := self.hparams.get("embedding_multiplier"): - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) - if residual_scale := self.hparams.get("residual_multiplier"): - self.gguf_writer.add_residual_scale(residual_scale) - logger.info("gguf: (granite) residual_scale = %s", residual_scale) - if logits_scale := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scale) - logger.info("gguf: (granite) logits_scale = %s", logits_scale) - - -@Model.register("GraniteMoeForCausalLM") -class GraniteMoeModel(GraniteModel): - """Conversion for IBM's GraniteMoeForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE_MOE - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - """In modeling_granitemoe, the JetMoe implementation of parallel experts - is used. This essentially merges w1 and w3 into a single tensor with 2x - the hidden size that is then split during forward. To keep compatibility - with existing mixtral support, we pull them apart here. 
- """ - - if name.endswith("block_sparse_moe.input_linear.weight"): - ffn_dim = self.hparams["intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" - gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), - ] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): - model_arch = gguf.MODEL_ARCH.CHAMELEON - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - - def set_vocab(self): - self._set_vocab_gpt2() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ignore image tokenizer for now - # TODO: remove this once image support is implemented for Chameleon - if name.startswith("model.vqmodel"): - return [] - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - hidden_dim = self.hparams.get("hidden_size") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - if name.endswith(("q_norm.weight", "q_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) - if name.endswith(("k_norm.weight", "k_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - - return [(self.map_tensor_name(name), data_torch)] - - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 - @staticmethod - def _reverse_hf_permute(data_torch, n_heads, hidden_dim): - head_dim = hidden_dim // n_heads - data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) - data_torch = data_torch.repeat_interleave(n_heads, 0) - return data_torch - - ###### CONVERSION LOGIC ###### diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 1905ad9273019..d9a33dce346b6 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + cache.can_shift = !cache.recurrent; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); @@ -71,11 +71,6 @@ bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - // DeepSeek MLA - cache.kr_l.reserve(n_layer); - cache.kv_l.reserve(n_layer); - cache.kvt_l.reserve(n_layer); - for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); @@ -102,20 +97,6 @@ bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); 
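The hunk continuing below removes the per-layer cache_kr_l / cache_kv_l latent tensors. For a sense of what that latent cache was buying, a back-of-the-envelope count of cached elements per token and per layer, using DeepSeek-V2-like dimensions (assumed here purely for illustration):

```python
# assumed, DeepSeek-V2-like dimensions (illustrative only)
n_head           = 128
qk_nope_head_dim = 128
qk_rope_head_dim = 64    # n_rot
v_head_dim       = 128
kv_lora_rank     = 512   # n_lora_kv

full_kv = n_head * (qk_nope_head_dim + qk_rope_head_dim) + n_head * v_head_dim
mla_kv  = kv_lora_rank + qk_rope_head_dim  # c^KV latent + shared RoPE key

print(full_kv, mla_kv, round(full_kv / mla_kv, 1))  # 40960 576 71.1
```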
cache.v_l.push_back(v); - - // DeepSeek MLA - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - LLAMA_LOG_DEBUG("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_format_name(kr, "cache_kr_l%d", i); - ggml_format_name(kv, "cache_kv_l%d", i); - ggml_format_name(kvt, "cache_kvt_l%d", i); - cache.kr_l.push_back(kr); - cache.kv_l.push_back(kv); - cache.kvt_l.push_back(kvt); } // allocate tensors and initialize the buffers to avoid NaNs in the padding diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 68b2e63d0e2b5..1ed688e3b5b7e 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -49,19 +49,11 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - ggml_type type_kr = GGML_TYPE_F16; - ggml_type type_kv = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer std::vector v_l; - // DeepSeek MLA - std::vector kr_l; // per layer - std::vector kv_l; - std::vector kvt_l; - std::vector ctxs; std::vector bufs; diff --git a/src/llama.cpp b/src/llama.cpp index c1aa5380498c6..b6f44403ea737 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6477,40 +6477,39 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(kv_compressed, "kv_compressed", il); - struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); - cb(kv_cache_view, "kv_cache_view", il); - - // note: storing c^KV in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); - cb(kv_cache_trans_view, "kv_cache_trans_view", il); + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); - // note: storing transposed c^KV in the transposed KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); - struct ggml_tensor * kv_cache = - ggml_view_2d(ctx0, kv_self.kv_l[il], - kv_lora_rank, n_kv, - ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank), - 0); - cb(kv_cache, "kv_cache", il); + v_states = 
ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); - struct ggml_tensor * kv_cache_trans = - ggml_view_2d(ctx0, kv_self.kvt_l[il], - n_kv, kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), - 0); - cb(kv_cache_trans, "kv_cache_trans", il); + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); - // shared RoPE key k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this k_pe = ggml_rope_ext( @@ -6520,91 +6519,15 @@ struct llm_build_context { ); cb(k_pe, "k_pe", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); - cb(kr_cache_view, "kr_cache_view", il); - - // note: storing RoPE-ed version of K^R in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view)); - - struct ggml_tensor * kr_cache = - ggml_view_2d(ctx0, kv_self.kr_l[il], - n_embd_head_qk_rope, n_kv, - ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope), - 0); - cb(kr_cache, "kr_cache", il); - - struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0); - cb(wk_b, "wk_b", il); - - q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); - cb(q_nope, "q_nope_perm", il); - - struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); - cb(q_nope2, "q_nope2", il); - - if (!pp_opt) { - q_nope2 = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); - cb(q_nope2, "q_nope2_perm", il); - } - - struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); - cb(kq_nope, "kq_nope", il); - - if (!pp_opt) { - kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); - cb(kq_nope, "kq_nope_perm", il); - } - - if (pp_opt) { - q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); - cb(q_pe, "q_pe_perm", il); - } - - struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); - cb(kq_pe, "kq_pe", il); - - if (!pp_opt) { - kq_pe = ggml_permute(ctx0, kq_pe, 0, 2, 1, 3); - cb(kq_pe, "kq_pe_perm", il); - } - - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); - cb(kq, "kq", il); - - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - if (!pp_opt) { - kq = ggml_permute(ctx0, kq, 0, 2, 1, 3); - cb(kq, "kq_soft_max_ext_perm", il); - } - - struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq); - cb(kqv_compressed, "kqv_compressed", il); - - if (!pp_opt) { - kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 3, 1); - cb(kqv_compressed, "kqv_compressed_perm", il); - } - - struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); - cb(wv_b, 
"wv_b", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); - cb(kqv, "kqv", il); - - if (pp_opt) { - kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3)); - cb(kqv, "kqv_perm", il); - } - - cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0); - cb(cur, "kqv_2d", il); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -9853,24 +9776,6 @@ struct llama_context * llama_init_from_model( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - { - size_t memory_size_kr = 0; - size_t memory_size_kv = 0; - - for (auto & kr : ctx->kv_self.kr_l) { - memory_size_kr += ggml_nbytes(kr); - } - - for (auto & kv : ctx->kv_self.kv_l) { - memory_size_kv += ggml_nbytes(kv); - } - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB\n", __func__, - (float)(memory_size_kr + memory_size_kv) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kv / (1024.0f * 1024.0f)); - } - // graph outputs buffer { // resized during inference when a batch uses more outputs From 69355a0b90b6d8ccf5a842f90ad69b33cc1ffadd Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 14:05:06 +0800 Subject: [PATCH 043/100] revert MLA --- src/llama-arch.cpp | 6 ---- src/llama-arch.h | 2 -- src/llama-model.cpp | 71 --------------------------------------------- src/llama-model.h | 2 -- 4 files changed, 81 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 93d0812c5fcfe..97a1e7e5e01ef 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -999,8 +999,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, - { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, - { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, @@ -1335,8 +1333,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, @@ -1354,8 +1350,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_K_B, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index c6105d59ac1f3..122fdcebe0af6 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -277,8 +277,6 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, - LLM_TENSOR_ATTN_K_B, - LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9db14d46f019c..338b678e0590f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2914,77 +2914,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); - layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); - layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); - if (!layer.wk_b || !layer.wv_b) { - if (!layer.wkv_b) { - throw std::runtime_error("wkv_b must be defined without wk_b and wv_b"); - } - - // select the buffer type for this tensor - buft_list_t * buft_list = pimpl->dev_input.buft_list; - - ggml_backend_buffer_type_t buft = nullptr; - - // check overrides - if (ml.tensor_buft_overrides) { - std::string tensor_name = "blk."+ std::to_string(i) +".attn_kv_b.weight"; - for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { - std::regex pattern(overrides->pattern); - if (std::regex_search(tensor_name, pattern)) { - LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); - buft = overrides->buft; - break; - } - } - } - - // avoid using a host buffer when using mmap - auto * buft_dev = ggml_backend_buft_get_device(buft); - if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - buft = ggml_backend_dev_buffer_type(cpu_dev); - } - - ggml_context * ctx = ctx_for_buft(buft); - - // 反量化 wkv_b - const auto * qtype = ggml_get_type_traits(layer.wkv_b->type); - std::vector dequantized_wkv_b(layer.wkv_b->ne[0] * layer.wkv_b->ne[1]); - qtype->to_float(layer.wkv_b->data, dequantized_wkv_b.data(), layer.wkv_b->ne[0] * layer.wkv_b->ne[1]); - - // 创建 wk_b 和 wv_b 张量 - auto * wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank); - auto * wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v); - - // 分割 wkv_b 数据来生成 wk_b 和 wv_b - for (int h = 0; h < n_head; ++h) { - int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); - - for (int row = 0; row < kv_lora_rank; ++row) { - for (int col = 0; col < n_embd_head_qk_nope; ++col) { - // 填充 wk_b - int src_idx = row * layer.wkv_b->ne[0] + k_start + col; - GGML_ASSERT(src_idx < dequantized_wkv_b.size()); - int dst_row = h * kv_lora_rank + row; - int dst_col = col; - ((float*)wk_b->data)[dst_row * n_embd_head_qk_nope + dst_col] = 
dequantized_wkv_b[src_idx]; - } - - for (int col = 0; col < n_embd_head_v; ++col) { - // 填充 wv_b - int src_idx = row * layer.wkv_b->ne[0] + k_start + n_embd_head_qk_nope + col; - GGML_ASSERT(src_idx < dequantized_wkv_b.size()); - int dst_row = row; - int dst_col = h * n_embd_head_v + col; - ((float*)wv_b->data)[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx]; - } - } - } - - layer.wk_b = ggml_cast(ctx, wk_b, layer.wkv_b->type); - layer.wv_b = ggml_cast(ctx, wv_b, layer.wkv_b->type); - } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index 1fdbd3721d630..a7c30444786fd 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -161,8 +161,6 @@ struct llama_layer { struct ggml_tensor * wq_b = nullptr; struct ggml_tensor * wkv_a_mqa = nullptr; struct ggml_tensor * wkv_b = nullptr; - struct ggml_tensor * wk_b = nullptr; - struct ggml_tensor * wv_b = nullptr; struct ggml_tensor * wq_cross = nullptr; struct ggml_tensor * wk_cross = nullptr; struct ggml_tensor * wv_cross = nullptr; From f9c292eb526c341e89aa16ab230e403bdb01f0ee Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 14:33:45 +0800 Subject: [PATCH 044/100] revert --- convert_hf_to_gguf.py | 3936 +++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 6 - gguf-py/gguf/tensor_mapping.py | 8 - src/llama-kv-cache.cpp | 6 +- src/llama-quant.cpp | 2 +- src/llama.cpp | 19 +- 6 files changed, 3882 insertions(+), 95 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 047612392c7a5..3db21d178ac59 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -947,63 +947,695 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + +@Model.register("GPTNeoXForCausalLM") +class GPTNeoXModel(Model): + model_arch = gguf.MODEL_ARCH.GPTNEOX + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("BloomForCausalLM", "BloomModel") +class BloomModel(Model): + model_arch = gguf.MODEL_ARCH.BLOOM + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + name = re.sub(r'transformer\.', '', name) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + if name == "word_embeddings.weight": + assert self.tensor_names is not None + + # TODO: tie them at runtime, don't duplicate in the model file + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + 
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("MPTForCausalLM") +class MPTModel(Model): + model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): - self._set_vocab_gpt2() + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) def set_gguf_parameters(self): - super().set_gguf_parameters() + block_count = self.hparams["n_layers"] + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) + + return [(new_name, data_torch)] + + +@Model.register("OrionForCausalLM") +class OrionModel(Model): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + +@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(Model): + model_arch = gguf.MODEL_ARCH.BAICHUAN + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = 
self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + tensors = [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + tensors = [(self.map_tensor_name(name), data_torch)] + + return tensors + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] 
+ + +@Model.register("XverseForCausalLM") +class XverseModel(Model): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model hparams = self.hparams - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + tokens: list[bytes] = [] + toktypes: list[int] = [] - if hparams["scoring_func"] == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif hparams["scoring_func"] == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. + max_vocab_index = max(tokenizer.get_vocab().values()) + if max_vocab_index >= vocab_size: + raise ValueError("Vocabulary size exceeds expected maximum size.") + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] else: - raise ValueError(f"Unsupported scoring_func value: 
{hparams['scoring_func']}") + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + return [(self.map_tensor_name(name), data_torch)] + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +@Model.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(Model): + model_arch = gguf.MODEL_ARCH.FALCON + + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. 
+ # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + head_dim = self.hparams["hidden_size"] // n_head + + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("GPTBigCodeForCausalLM") +class StarCoderModel(Model): + model_arch = gguf.MODEL_ARCH.STARCODER + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@Model.register("GPTRefactForCausalLM") +class RefactModel(Model): + model_arch = gguf.MODEL_ARCH.REFACT + + def set_vocab(self): + super().set_vocab() + + # TODO: how to determine special FIM tokens automatically? + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab._set_special_token("prefix", 1) + special_vocab._set_special_token("suffix", 3) + special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + # refact uses Alibi. So this is from config.json which might be used by training. 
+ self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) + elif name == f"transformer.h.{bid}.attn.q.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) + elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) + + if len(tensors) == 0: + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(Model): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_file_type(self.ftype) + + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + self._q_norms[bid][name] = data_torch + + if 
len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return [] + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + new_name = self.map_tensor_name(merged_name) + + return [(new_name, data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") + + +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + if self.hparams["rope_scaling"].get("type") == "linear": + 
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # rename e_score_correction_bias tensors - if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") - # skip Multi-Token Prediction (MTP) layers - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return [] + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + assert bid is not None if self._experts is None: @@ -1015,17 +1647,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: + for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" new_name = self.map_tensor_name(merged_name) @@ -1034,29 +1666,36 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.hparams["v_head_dim"] - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) - v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) + return [(self.map_tensor_name(name), data_torch)] - return [ - (self.map_tensor_name(name), data_torch), - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] + def 
generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - return [(self.map_tensor_name(name), data_torch)] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): super().prepare_tensors() @@ -1068,6 +1707,3177 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeciLMForCausalLM") +class DeciModel(Model): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) + self._num_heads.append(self.hparams["num_attention_heads"]) + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert 
self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = 
self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() + + +@Model.register("BitnetForCausalLM") +class BitnetModel(Model): + model_arch = gguf.MODEL_ARCH.BITNET + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def weight_quant(self, weight: Tensor) -> Tensor: + dtype = weight.dtype + weight = weight.float() + scale = weight.abs().mean().clamp(min=1e-5) + iscale = 1 / scale + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + result = (weight * iscale).round().clamp(-1, 1) / iscale + return result.type(dtype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): + # transform weight into 1/0/-1 (in fp32) + data_torch = self.weight_quant(data_torch) + + yield (new_name, data_torch) + + +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def __init__(self, *args, 
**kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find(".moe.") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["linear", "linear_1", "linear_v"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("DbrxForCausalLM") +class DbrxModel(Model): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = 
data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + return [(new_name, data_torch)] + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, new_name, bid # unused + + return n_dims > 1 + + +@Model.register("MiniCPMForCausalLM") +class MiniCPMModel(Model): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(Model): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + 
self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +@Model.register("QWenLMHeadModel") +class QwenModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + 
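+        # note: Qwen (v1) applies RoPE to the full head dimension, so the RoPE dimension count
+        # below is simply hidden_size // num_attention_heads, with the base taken from rotary_emb_base.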
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@Model.register("Qwen2ForCausalLM") +class Qwen2Model(Model): + model_arch = gguf.MODEL_ARCH.QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + + +@Model.register("Qwen2VLForConditionalGeneration") +class Qwen2VLModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + mrope_section = self.hparams["rope_scaling"]["mrope_section"] + mrope_section += [0] * max(0, 4 - len(mrope_section)) + self.gguf_writer.add_rope_dimension_sections(mrope_section) + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, data in super().get_tensors(): + if name.startswith("visual."): + continue + yield name, data + + +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return [] + + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) + + +@Model.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + 
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) + logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GPT2LMHeadModel") +class GPT2Model(Model): + model_arch = gguf.MODEL_ARCH.GPT2 + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + tensors.append((new_name, data_torch)) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("PhiForCausalLM") +class Phi2Model(Model): + model_arch = gguf.MODEL_ARCH.PHI2 + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + 
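+        # note: partial_rotary_factor means only a fraction of each head dimension is rotated by RoPE;
+        # rope_dimension_count below is int(rot_pct * n_embd) // n_head. For example, with Phi-2's
+        # published config (partial_rotary_factor=0.4, hidden_size=2560, 32 heads - figures quoted
+        # here only for illustration) that gives 32 rotary dims out of an 80-dim head.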
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +@Model.register("Phi3ForCausalLM") +class Phi3MiniModel(Model): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': + return self._set_vocab_gpt2() + + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if 
foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_file_type(self.ftype) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('type', '').lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if 
scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + +@Model.register("PhiMoEForCausalLM") +class PhiMoeModel(Phi3MiniModel): + model_arch = gguf.MODEL_ARCH.PHIMOE + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) + self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("PlamoForCausalLM") +class PlamoModel(Model): + model_arch = gguf.MODEL_ARCH.PLAMO + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + 
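+        # The 5120 output rows are 40 query heads x 128 dims; viewing them as (8, 5, 128, 5120) and
+        # swapping the first two axes (below) appears to regroup the heads so that the 8 query heads
+        # sharing each of the 5 KV heads (see add_head_count_kv above) become contiguous, which is
+        # the layout ggml's GQA broadcast in ggml_mul_mat expects.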
data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + return [(new_name, data_torch)] + + +@Model.register("CodeShellForCausalLM") +class CodeShellModel(Model): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + assert self.tensor_names is not None + + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + # copy tok_embd.weight to output.weight + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("InternLM2ForCausalLM") +class InternLM2Model(Model): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. 
+ from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉".encode("utf-8") + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + # take care of ununsed raw token + if piece.startswith('[UNUSED'): + toktype = SentencePieceTokenTypes.UNUSED + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + chat_eos_token = '<|im_end|>' + chat_eos_token_id = None + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> 
{token.decode("utf-8")!r}')
+                        tokens[token_id] = token
+                        scores[token_id] = -1000.0
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        old_eos = special_vocab.special_token_ids["eos"]
+        if chat_eos_token_id is not None:
+            # For the chat model, we replace the eos with '<|im_end|>'.
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        n_embd = self.hparams["hidden_size"]
+        q_per_kv = num_heads // num_kv_heads
+        head_dim = n_embd // num_heads
+        num_groups = num_heads // q_per_kv
+
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
+            qkv = data_torch
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
+            # The model weights of q and k require additional reshape.
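+            # wqkv packs, for each of the num_groups KV groups, q_per_kv query heads followed by one
+            # K head and one V head; the reshape above splits them apart. q and k are then permuted
+            # with the same LlamaModel.permute used for HF Llama checkpoints, so their rotary (RoPE)
+            # dimension ordering matches what llama.cpp expects.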
+ q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), + ] + else: + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("InternLM3ForCausalLM") +class InternLM3Model(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + if "added_tokens_decoder" in tokenizer_config_json: + for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): + if token_data.get("special"): + token_id = int(token_id) + token = token_data["content"] + special_vocab._set_special_token(token, token_id) + # update eos token + if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: + special_vocab.special_token_ids["eos"] = token_id + + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(Model): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in 
modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + # convert to phantom space vocab + def phantom(tok): + if tok.startswith("[") and tok.endswith("]"): + return tok + if tok.startswith("##"): + return tok[2:] + return "\u2581" + tok + tokens = list(map(phantom, tokens)) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + + if name.endswith(".beta"): + name = name[:-5] + ".bias" + + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return [] # we don't need these + + if name.startswith("cls.predictions"): + return [] + + if name.startswith("cls.seq_relationship"): + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + +@Model.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.NOMIC_BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + +@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = 
SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + # realign tokens (see HF tokenizer code) + tokens = [b'', b'', b'', b''] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + +@Model.register("GemmaForCausalLM") +class GemmaModel(Model): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
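+        # note: Gemma ties lm_head to the token embedding, so llama.cpp reuses token_embd for the
+        # output head. The "+ 1" applied to the norm weights below folds in GemmaRMSNorm's
+        # (1 + weight) scaling (see the transformers reference linked below), so the GGUF tensor
+        # stores the full multiplier directly.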
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Gemma2ForCausalLM") +class Gemma2Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Starcoder2ForCausalLM") +class StarCoder2Model(Model): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + +@Model.register("Rwkv6ForCausalLM") +class Rwkv6Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV6 + + def set_vocab(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = [''.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_size = self.hparams["head_size"] + hidden_size = self.hparams["hidden_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + rescale_every_n_layers = self.hparams["rescale_every"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) + time_mix_extra_dim = 64 if hidden_size == 4096 else 32 + time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if 
new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): + data_torch = data_torch.transpose(0, 1) + + if new_name.endswith("time_mix_w2.weight"): + data_torch = data_torch.permute(0, 2, 1) + + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + except KeyError: + pass + + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) + yield (new_name, data) + return + + yield (new_name, data_torch) + + +@Model.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 + time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + 
continue + yield (new_name, data) + + +@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + use_dt_b_c_norm = False + # For falconmamba we do apply RMS norm on B / DT and C layers + if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): + use_dt_b_c_norm = True + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_file_type(self.ftype) + + _tok_embd = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return [] + elif new_name == tok_embd_name: + self._tok_embd = data_torch + + return [(new_name, data_torch)] + + +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = 
gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@Model.register("Cohere2ForCausalLM") +class Cohere2Model(Model): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@Model.register("OlmoForCausalLM") +@Model.register("OLMoForCausalLM") +class OlmoModel(Model): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 + + +@Model.register("OlmoeForCausalLM") +class OlmoeModel(Model): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = 
f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("JinaBertModel", "JinaBertForMaskedLM") +class JinaBertV2Model(BertModel): + model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.intermediate_size = self.hparams["intermediate_size"] + + def get_tensors(self): + for name, data in super().get_tensors(): + if 'gated_layer' in name: + d1 = data[:self.intermediate_size, :] + name1 = name.replace('gated_layers', 'gated_layers_w') + name1 = name1.replace('up_gated_layer', 'gated_layers_v') + d2 = data[self.intermediate_size:, :] + name2 = name.replace('gated_layers', 'gated_layers_v') + name2 = name2.replace('up_gated_layer', 'gated_layers_w') + yield name1, d1 + yield name2, d2 + continue + + yield name, data + + def set_vocab(self): + tokenizer_class = 'BertTokenizer' + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_class = json.load(f)['tokenizer_class'] + + if tokenizer_class == 'BertTokenizer': + super().set_vocab() + elif tokenizer_class == 'RobertaTokenizer': + self._set_vocab_gpt2() + self.gguf_writer.add_token_type_count(2) + else: + raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "bert.", remove the prefix + # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + if name.startswith("bert."): + name = name[5:] + + return super().modify_tensors(data_torch, name, bid) + + +@Model.register("OpenELMForCausalLM") +class OpenELMModel(Model): + model_arch = gguf.MODEL_ARCH.OPENELM + + @staticmethod + def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] + ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] + self._n_embd: int = self.hparams["model_dim"] + self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] + self._num_query_heads: list[int] = self.hparams["num_query_heads"] + self._ffn_dims: list[int] = [ + OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) + for multiplier in ffn_multipliers + ] + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + + # Uses the tokenizer from meta-llama/Llama-2-7b-hf + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + + def set_gguf_parameters(self): + n_embd = self._n_embd + head_dim = self.hparams["head_dim"] + rot_pct = 1.0 + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_query_heads) + assert self.block_count == len(self._ffn_dims) + + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_context_length"]) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_head_count(self._num_query_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 + self.gguf_writer.add_layer_norm_rms_eps(1e-6) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + if "n_layers" in keys: + return self.hparams["num_transformer_layers"] + + return super().find_hparam(keys, optional) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # split ff + if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": + ff_dim = self._ffn_dims[bid] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + return + + yield (self.map_tensor_name(name), data_torch) + + +@Model.register("ArcticForCausalLM") +class ArcticModel(Model): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. 
+ from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if 
name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekForCausalLM") +class DeepseekModel(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= 
n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") + + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return [] + 
+ # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("T5WithLMHeadModel") +@Model.register("T5ForConditionalGeneration") +@Model.register("MT5ForConditionalGeneration") +@Model.register("UMT5ForConditionalGeneration") +class T5Model(Model): + model_arch = gguf.MODEL_ARCH.T5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL 
+ if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(True) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("T5EncoderModel") +class T5EncoderModel(Model): + model_arch = gguf.MODEL_ARCH.T5ENCODER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) 
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(True) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + if 'mup_embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias")): + return tensors + + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch[0].item()) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + + def prepare_tensors(self): + super().prepare_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) + + +@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(Model): + model_arch = gguf.MODEL_ARCH.CHATGLM + + def set_vocab_chatglm3(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytes] = 
[]
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
+                else:
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from 
transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) + assert max(tokenizer.get_vocab().values()) < vocab_size + + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + # only add special tokens when they were not already loaded from config.json + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) + self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"])) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) + self.gguf_writer.add_file_type(self.ftype) + if "attention_dim" in self.hparams: + rope_dim = self.hparams["attention_dim"] + else: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_add_bos_token(False) + rope_freq = 10000 + if "rope_ratio" in self.hparams: + rope_freq = rope_freq * self.hparams["rope_ratio"] + self.gguf_writer.add_rope_freq_base(rope_freq) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): + return [] + + name = name.removeprefix("transformer.") + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("NemotronForCausalLM") +class NemotronModel(Model): + model_arch = gguf.MODEL_ARCH.NEMOTRON + + def set_vocab(self): + self._set_vocab_sentencepiece() + self.gguf_writer.add_pad_token_id(0) + self.gguf_writer.add_unk_token_id(1) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + + # * Partial RoPE + rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + + # * RopeScaling for Nemotron + if 
"rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("ExaoneForCausalLM") +class ExaoneModel(Model): + model_arch = gguf.MODEL_ARCH.EXAONE + + def set_gguf_parameters(self): + hparams = self.hparams + + assert (hparams["activation_function"] == "silu") + + max_position_embeddings = hparams["max_position_embeddings"] + embed_dim = hparams["hidden_size"] + num_heads = hparams["num_attention_heads"] + num_kv_heads = hparams.get("num_key_value_heads", num_heads) + layer_norm_eps = hparams["layer_norm_epsilon"] + intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim + num_layers = hparams["num_layers"] + # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 + # attention_dropout_rate = hparams["attention_dropout"] + # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 + # embed_dropout_rate = hparams["embed_dropout"] + self.gguf_writer.add_embedding_length(embed_dim) + self.gguf_writer.add_head_count(num_heads) + self.gguf_writer.add_head_count_kv(num_kv_heads) + self.gguf_writer.add_context_length(max_position_embeddings) + self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_block_count(num_layers) + self.gguf_writer.add_file_type(self.ftype) + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = rotary_factor if rotary_factor is not None else 1.0 + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: + if hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen 
!= high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@Model.register("GraniteForCausalLM") +class GraniteModel(LlamaModel): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_scale + - embedding_scale + - residual_scale + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) + + +@Model.register("GraniteMoeForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE_MOE + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compatibility + with existing mixtral support, we pull them apart here. 
+ """ + + if name.endswith("block_sparse_moe.input_linear.weight"): + ffn_dim = self.hparams["intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), + ] + + return super().modify_tensors(data_torch, name, bid) + + +@Model.register("ChameleonForConditionalGeneration") +@Model.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(Model): + model_arch = gguf.MODEL_ARCH.CHAMELEON + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) + + def set_vocab(self): + self._set_vocab_gpt2() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ignore image tokenizer for now + # TODO: remove this once image support is implemented for Chameleon + if name.startswith("model.vqmodel"): + return [] + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) + + return [(self.map_tensor_name(name), data_torch)] + + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch + + ###### CONVERSION LOGIC ###### @@ -1299,4 +5109,4 @@ def main() -> None: if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 86ff013b71bb3..ecac5b4bb7f59 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -356,8 +356,6 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() - ATTN_K_B = auto() - ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -545,8 +543,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", - MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", - MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -1337,8 +1333,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, - MODEL_TENSOR.ATTN_K_B, - MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, diff --git 
a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index df831ba70594c..617791e240b60 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -586,14 +586,6 @@ class TensorNameMap: "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), - MODEL_TENSOR.ATTN_K_B: ( - "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 - ), - - MODEL_TENSOR.ATTN_V_B: ( - "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 - ), - MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d9a33dce346b6..b5fbb3a25f0b6 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -53,7 +53,7 @@ bool llama_kv_cache_init( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { struct ggml_init_params params = { - /*.mem_size =*/ size_t(5u*n_layer*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -91,8 +91,8 @@ bool llama_kv_cache_init( return false; } - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, 1); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, 1); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cabb5f8f8cdd5..ab50c5d179a29 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,7 +776,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. if (!params->pure && ggml_is_quantized(default_type)) { - new_type = name.find("_exps") != std::string::npos ? name.find("ffn_down") != std::string::npos ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K : GGML_TYPE_BF16; + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; diff --git a/src/llama.cpp b/src/llama.cpp index b6f44403ea737..1bcc0b7e6df6d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1092,8 +1092,7 @@ struct llm_build_context { llama_context & lctx, const llama_ubatch & ubatch, const llm_build_cb & cb, - bool worst_case, - bool warmup) : + bool worst_case) : model (lctx.model), lctx (lctx), hparams (model.hparams), @@ -1111,7 +1110,7 @@ struct llm_build_context { n_embd_head_v (hparams.n_embd_head_v), n_embd_v_gqa (hparams.n_embd_v_gqa()), n_expert (hparams.n_expert), - n_expert_used (warmup ? 
hparams.n_expert : hparams.n_expert_used),
+        n_expert_used (hparams.n_expert_used),
         freq_base     (cparams.rope_freq_base),
         freq_scale    (cparams.rope_freq_scale),
         ext_factor    (cparams.yarn_ext_factor),
@@ -6405,10 +6404,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // whether to use n_tokens as the matrix dimension during multiplication or n_head
-        // n_tokens is higher during prompt processing, this allows to optimize for this case
-        bool pp_opt = true;
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -8122,7 +8117,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false, false);
+    struct llm_build_context llm(lctx, dummy, cb, false);
 
     llm.init();
 
@@ -8139,7 +8134,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false, false);
+    struct llm_build_context llm(lctx, dummy, cb, false);
 
    llm.init();
 
@@ -8190,11 +8185,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_cgraph * result = NULL;
 
-    const llama_vocab * vocab = llama_model_get_vocab(&model);
-    llama_token bos = llama_vocab_bos(vocab);
-    llama_token eos = llama_vocab_eos(vocab);
-    bool is_warming_up = (ubatch.n_tokens == 2 && ubatch.token[0] == bos && ubatch.token[1] == eos);
-    struct llm_build_context llm(lctx, ubatch, cb, worst_case, is_warming_up);
+    struct llm_build_context llm(lctx, ubatch, cb, worst_case);
 
     llm.init();
 

From 6cccad2880c41d52811499cc248542462c4c8709 Mon Sep 17 00:00:00 2001
From: orca-zhang
Date: Wed, 26 Feb 2025 14:54:27 +0800
Subject: [PATCH 045/100] fix

---
 examples/server/server.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 4c7212759de37..f7a92a0d9f14a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1695,6 +1695,11 @@ struct server_response {
     // This function blocks the thread until there is a response for one of the id_tasks
     server_task_result_ptr recv(const std::unordered_set<int> & id_tasks) {
         while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return queue_results.cbegin() != queue_results.cend();
+            });
+
             for (const auto & id_task : id_tasks) {
                 auto iter = queue_results.find(id_task);
                 if (iter != queue_results.cend()) {
@@ -1703,11 +1708,6 @@ struct server_response {
                     return res;
                 }
             }
-
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return queue_results.cbegin() != queue_results.cend();
-            });
         }
 
         // should never reach here
@@ -1739,17 +1739,17 @@ struct server_response {
     // single-task version of recv()
     server_task_result_ptr recv(int id_task) {
         while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return queue_results.cbegin() != queue_results.cend();
+            });
+
             auto iter = queue_results.find(id_task);
             if (iter != queue_results.cend()) {
                 server_task_result_ptr res = iter->second;
                 queue_results.erase(id_task);
                 return res;
             }
-
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return queue_results.cbegin() != queue_results.cend();
-            });
         }
     }
 
From 20e429d2d0a554ae8a60e555533d2181c13f1cb1 Mon Sep 17 00:00:00 2001
From: orca-zhang Date: Wed, 26 Feb 2025 17:46:18 +0800 Subject: [PATCH 046/100] add flash_attn --- ggml/src/ggml-cuda/fattn.cu | 3 ++ ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- ggml/src/ggml-cuda/pad.cu | 53 +++++++++++++++++++++++++++++---- src/llama.cpp | 14 +++++++-- 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index b1becccb4de72..c20b0ddd33f49 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -50,6 +50,9 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context case 128: ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst); break; + case 192: + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, ncols2>(ctx, dst); + break; case 256: ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst); break; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 5725fc375cc89..ebb2ccae04065 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3248,7 +3248,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) { } static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 9999999; + const int min_batch_size = 32; return get_op_batch_size(op) >= min_batch_size; diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index aba539e8dad10..d9e0bd09301cf 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -25,6 +25,31 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons } } +static __global__ void pad_f16(const half * x, half * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { + // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 + // blockIdx.y: idx of ne1 + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { + int offset_src = + nidx + + blockIdx.y * ne00 + + blockIdx.z * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + static void pad_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { @@ -33,17 +58,35 @@ static void pad_f32_cuda(const float * x, float * dst, pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03); } +static void pad_f16_cuda(const half * x, half * dst, + const int ne00, const int ne01, const int ne02, const int ne03, + const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne1, ne2*ne3); + pad_f16<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03); +} + void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - pad_f32_cuda(src0_d,
dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + if (src0->type == GGML_TYPE_F32) { + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + pad_f32_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + } else { + const half * src0_d = (const half *)src0->data; + half * dst_d = (half *)dst->data; + pad_f16_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + } } diff --git a/src/llama.cpp b/src/llama.cpp index 1bcc0b7e6df6d..6c531ee2aa378 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -588,8 +588,16 @@ static struct ggml_tensor * llm_build_kqv( ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); + + struct ggml_tensor * padded_v = v; + int64_t n_embd_head_v_out = n_embd_head_v; + if (n_embd_head_v < n_embd_head_k) { + padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); + cb(padded_v, "padded_v", il); + n_embd_head_v_out = n_embd_head_k; + } - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); @@ -9567,8 +9575,8 @@ struct llama_context * llama_init_from_model( params.flash_attn = false; } - if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { - LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); + if (params.flash_attn && model->hparams.n_embd_head_k < model->hparams.n_embd_head_v) { + LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k >= n_embd_head_v - forcing off\n", __func__); params.flash_attn = false; } From 17cf6f87565816fa06b340222f9a2348e4609dd8 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 17:48:50 +0800 Subject: [PATCH 047/100] fix warning --- ggml/src/ggml-cuda/pad.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index d9e0bd09301cf..353a89589ee3c 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -68,8 +68,6 @@ static void pad_f16_cuda(const half * x, half * dst, void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); From 6627422fd6a266b7e231e69fcd34bf55d266ff21 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:35:44 +0800 Subject: [PATCH 048/100] fix --- src/llama.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 6c531ee2aa378..e1947c384864c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -588,11 +588,12 @@ static struct ggml_tensor * llm_build_kqv( ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); - + struct ggml_tensor * padded_v = v; int64_t n_embd_head_v_out = n_embd_head_v; if (n_embd_head_v < n_embd_head_k) { - padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); + // Pad the feature dimension (assuming it's the third dimension, adjust indices as per actual tensor layout) + padded_v = ggml_pad(ctx, 
v, 0, 0, k->ne[2] - v->ne[2], 0); // Correct dimension for feature padding cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; } From d82682187b63ea9c8f732a4a252ebbbaf103896e Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:43:35 +0800 Subject: [PATCH 049/100] fix --- ggml/src/ggml.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 206fd03413301..baf54532ab97d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3061,6 +3061,9 @@ struct ggml_tensor * ggml_reshape_2d( int64_t ne0, int64_t ne1) { GGML_ASSERT(ggml_is_contiguous(a)); + if (ggml_nelements(a) != ne0*ne1) { + GGML_LOG_ERROR("ggml_reshape_2d: number of elements mismatch name: %s\n", a->name); + } GGML_ASSERT(ggml_nelements(a) == ne0*ne1); const int64_t ne[2] = { ne0, ne1 }; From a5ca0eb21ed8701606f6facd53f8070becd215ef Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:49:13 +0800 Subject: [PATCH 050/100] fix --- src/llama.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e1947c384864c..fc401135bd61a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -582,7 +582,7 @@ static struct ggml_tensor * llm_build_kqv( // split cached v into n_head heads (not transposed) struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], + ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), @@ -592,8 +592,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * padded_v = v; int64_t n_embd_head_v_out = n_embd_head_v; if (n_embd_head_v < n_embd_head_k) { - // Pad the feature dimension (assuming it's the third dimension, adjust indices as per actual tensor layout) - padded_v = ggml_pad(ctx, v, 0, 0, k->ne[2] - v->ne[2], 0); // Correct dimension for feature padding + padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; } @@ -603,6 +602,15 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + if (n_embd_head_v < n_embd_head_k) { + cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens); + cur = ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, + ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out * n_head, + 0); + cur = ggml_cont(ctx, cur); + } + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); From 4c33abe4e4fdea3f10418784675ed7cea12ac775 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:57:03 +0800 Subject: [PATCH 051/100] add log --- ggml/src/ggml-cuda/fattn.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index c20b0ddd33f49..d580e3abe27af 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -10,6 +10,7 @@ template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + GGML_LOG_INFO("ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1: D=%d, ncols2=%d", D, ncols2); const ggml_tensor * Q = dst->src[0]; if (Q->ne[1] <= 8/ncols2) { @@ -32,6 +33,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con template static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + 
GGML_LOG_INFO("ggml_cuda_flash_attn_ext_mma_f16_switch_hs: ncols2=%d", ncols2); const ggml_tensor * Q = dst->src[0]; switch (Q->ne[0]) { From 794d740b7d923088c4f01ddb3a7a9d7da7a2884b Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:58:16 +0800 Subject: [PATCH 052/100] fix --- ggml/src/ggml-cuda/fattn.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index d580e3abe27af..4ebd5f94c4d8d 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -7,6 +7,7 @@ #include "fattn-vec-f32.cuh" #include "fattn-wmma-f16.cuh" #include "fattn.cuh" +#include "ggml.h" template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { From d70d9f08e4d6816443d5a822a033940afcca09e3 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 18:59:23 +0800 Subject: [PATCH 053/100] fix --- ggml/src/ggml-cuda/fattn.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 4ebd5f94c4d8d..8b7724fba7f5a 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -7,7 +7,7 @@ #include "fattn-vec-f32.cuh" #include "fattn-wmma-f16.cuh" #include "fattn.cuh" -#include "ggml.h" +#include "ggml-impl.h" template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { From fad3960d3594b4731de3cb65d3cf83bf810f4eab Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:11:52 +0800 Subject: [PATCH 054/100] add log --- ggml/src/ggml-cuda/fattn.cu | 3 --- ggml/src/ggml-cuda/pad.cu | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 8b7724fba7f5a..c20b0ddd33f49 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -7,11 +7,9 @@ #include "fattn-vec-f32.cuh" #include "fattn-wmma-f16.cuh" #include "fattn.cuh" -#include "ggml-impl.h" template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - GGML_LOG_INFO("ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1: D=%d, ncols2=%d", D, ncols2); const ggml_tensor * Q = dst->src[0]; if (Q->ne[1] <= 8/ncols2) { @@ -34,7 +32,6 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con template static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - GGML_LOG_INFO("ggml_cuda_flash_attn_ext_mma_f16_switch_hs: ncols2=%d", ncols2); const ggml_tensor * Q = dst->src[0]; switch (Q->ne[0]) { diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 353a89589ee3c..21d547eb274b4 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -57,6 +57,7 @@ static void pad_f32_cuda(const float * x, float * dst, dim3 gridDim(num_blocks, ne1, ne2*ne3); pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); } +#include "ggml-impl.h" static void pad_f16_cuda(const half * x, half * dst, const int ne00, const int ne01, const int ne02, const int ne03, @@ -73,6 +74,8 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_LOG_INFO("ggml_cuda_op_pad: type=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d\n", + 
src0->type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); if (src0->type == GGML_TYPE_F32) { const float * src0_d = (const float *)src0->data; From 6372f5485a6cf7cd17e3163064d3dc438ff9c7a9 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:12:25 +0800 Subject: [PATCH 055/100] fix log --- ggml/src/ggml-cuda/pad.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 21d547eb274b4..6d82aab8a7eaf 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -74,7 +74,7 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - GGML_LOG_INFO("ggml_cuda_op_pad: type=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d\n", + GGML_LOG_INFO("ggml_cuda_op_pad: type=%ld, ne0=%d, ne1=%d, ne2=%d, ne3=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d\n", src0->type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); if (src0->type == GGML_TYPE_F32) { From 1eeec1c27bf85f9ebbef26e0d260dab77f67e534 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:19:14 +0800 Subject: [PATCH 056/100] fix prec --- ggml/src/ggml.c | 3 --- src/llama.cpp | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index baf54532ab97d..206fd03413301 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3061,9 +3061,6 @@ struct ggml_tensor * ggml_reshape_2d( int64_t ne0, int64_t ne1) { GGML_ASSERT(ggml_is_contiguous(a)); - if (ggml_nelements(a) != ne0*ne1) { - GGML_LOG_ERROR("ggml_reshape_2d: number of elements mismatch name: %s\n", a->name); - } GGML_ASSERT(ggml_nelements(a) == ne0*ne1); const int64_t ne[2] = { ne0, ne1 }; diff --git a/src/llama.cpp b/src/llama.cpp index fc401135bd61a..6673f60ad9602 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -600,7 +600,9 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + if (v->type == GGML_TYPE_F32) { + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + } if (n_embd_head_v < n_embd_head_k) { cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens); From be5f49983980c69e95461605756c3c2168bca88b Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:26:36 +0800 Subject: [PATCH 057/100] add --- ggml/src/ggml-cuda/fattn.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index c20b0ddd33f49..fdf7c8d1b4c1c 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -163,6 +163,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS @@ -238,6 +239,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS From e341ec64c62b6490d04a4279d0926df899cf20fd Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:39:20 +0800 Subject: [PATCH 058/100] fix --- src/llama.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 6673f60ad9602..c6df68ef9a3e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -600,17 +600,26 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); + LLAMA_LOG_INFO("kq_scale: %f\n", kq_scale); + + // 检查 Softmax 参数 + if (hparams.attn_soft_cap) { + LLAMA_LOG_INFO("Soft capping applied: %f\n", hparams.f_attn_logit_softcapping); + } + LLAMA_LOG_INFO("q shape: [%ld, %ld, %ld]\n", q->ne[0], q->ne[1], q->ne[2]); + LLAMA_LOG_INFO("k shape: [%ld, %ld, %ld]\n", k->ne[0], k->ne[1], k->ne[2]); + LLAMA_LOG_INFO("padded_v shape: [%ld, %ld, %ld]\n", padded_v->ne[0], padded_v->ne[1], padded_v->ne[2]); + if (v->type == GGML_TYPE_F32) { ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); } if (n_embd_head_v < n_embd_head_k) { - cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens); - cur = ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, + cur = ggml_cont(ctx, ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens)); + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, ggml_element_size(cur) * n_embd_head_v_out * n_head, - 0); - cur = ggml_cont(ctx, cur); + 0)); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 6305eb7d85e7d750ba35285cd0e25ca62577cc6d Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:40:45 +0800 Subject: [PATCH 059/100] fix --- src/llama.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index c6df68ef9a3e3..5f0b991ddb539 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -591,12 +591,21 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * padded_v = v; int64_t n_embd_head_v_out = n_embd_head_v; + // 确保正确填充特征维度(假设v的特征维度是ne[2]) if (n_embd_head_v < n_embd_head_k) { - padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); + padded_v = ggml_pad(ctx, v, + 0, // 不填充dim 0 + 0, // 不填充dim 1 + n_embd_head_k - n_embd_head_v, // 填充特征维度dim 2 + 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; } + // 确保Flash Attention输入维度对齐 + GGML_ASSERT(padded_v->ne[2] == k->ne[2]); // 特征维度一致 + GGML_ASSERT(q->ne[1] == k->ne[1]); // 序列长度一致 + cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); From ebb19c53a515b944b644534b0ba3df89c2565405 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:44:36 +0800 Subject: [PATCH 060/100] fix --- src/llama.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5f0b991ddb539..c6df68ef9a3e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -591,21 +591,12 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * padded_v = v; int64_t n_embd_head_v_out = n_embd_head_v; - // 确保正确填充特征维度(假设v的特征维度是ne[2]) if (n_embd_head_v < n_embd_head_k) { - padded_v = ggml_pad(ctx, v, - 0, // 不填充dim 0 - 0, // 不填充dim 1 - n_embd_head_k - n_embd_head_v, // 填充特征维度dim 2 - 0); + padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; } - // 确保Flash Attention输入维度对齐 - GGML_ASSERT(padded_v->ne[2] == k->ne[2]); // 特征维度一致 - GGML_ASSERT(q->ne[1] == k->ne[1]); // 序列长度一致 - cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); From 907d09a7216d8540f9d20c4ca4357aec11777361 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 19:51:57 +0800 Subject: [PATCH 061/100] fix --- src/llama.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c6df68ef9a3e3..763849954fcbd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -610,9 +610,7 @@ static struct ggml_tensor * llm_build_kqv( LLAMA_LOG_INFO("k shape: [%ld, %ld, %ld]\n", k->ne[0], k->ne[1], k->ne[2]); LLAMA_LOG_INFO("padded_v shape: [%ld, %ld, %ld]\n", padded_v->ne[0], padded_v->ne[1], padded_v->ne[2]); - if (v->type == GGML_TYPE_F32) { - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - } + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { cur = ggml_cont(ctx, ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens)); From 3395a3498a24a6cd474b9479e2931e9fc5651f45 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 20:06:33 +0800 Subject: [PATCH 062/100] fix --- src/llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 763849954fcbd..275de3df48534 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1338,8 +1338,9 @@ struct llm_build_context { } struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { + const auto i = std::max(n_embd_head_k, n_embd_head_v); lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, cparams.flash_attn ? i * i : n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); cb(lctx.inp_KQ_mask, "KQ_mask", -1); ggml_set_input(lctx.inp_KQ_mask); From 5f5f9cd8e569fcb8af7a8ae972d392ae778804c5 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Wed, 26 Feb 2025 20:13:52 +0800 Subject: [PATCH 063/100] fix --- ggml/src/ggml-cuda/pad.cu | 2 -- src/llama.cpp | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 6d82aab8a7eaf..8f8cced4b4d31 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -74,8 +74,6 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - GGML_LOG_INFO("ggml_cuda_op_pad: type=%ld, ne0=%d, ne1=%d, ne2=%d, ne3=%d, ne0=%d, ne1=%d, ne2=%d, ne3=%d\n", - src0->type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); if (src0->type == GGML_TYPE_F32) { const float * src0_d = (const float *)src0->data; diff --git a/src/llama.cpp b/src/llama.cpp index 275de3df48534..763849954fcbd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1338,9 +1338,8 @@ struct llm_build_context { } struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - const auto i = std::max(n_embd_head_k, n_embd_head_v); lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, cparams.flash_attn ? i * i : n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); cb(lctx.inp_KQ_mask, "KQ_mask", -1); ggml_set_input(lctx.inp_KQ_mask); From dcbce53dee856739ea51db0be627b6500425583a Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 13:25:37 +0800 Subject: [PATCH 064/100] fix --- examples/server/server.cpp | 20 ++++++++++---------- src/llama.cpp | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f7a92a0d9f14a..4c7212759de37 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1695,11 +1695,6 @@ struct server_response { // This function blocks the thread until there is a response for one of the id_tasks server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return queue_results.cbegin() != queue_results.cend(); - }); - for (const auto & id_task : id_tasks) { auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { @@ -1708,6 +1703,11 @@ struct server_response { return res; } } + + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return queue_results.cbegin() != queue_results.cend(); + }); } // should never reach here @@ -1739,17 +1739,17 @@ struct server_response { // single-task version of recv() server_task_result_ptr recv(int id_task) { while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return queue_results.cbegin() != queue_results.cend(); - }); - auto iter = queue_results.find(id_task); if (iter != queue_results.cend()) { server_task_result_ptr res = iter->second; queue_results.erase(id_task); return res; } + + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return queue_results.cbegin() != queue_results.cend(); + }); } } diff --git a/src/llama.cpp b/src/llama.cpp index 763849954fcbd..534ff0451038c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -613,7 +613,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens)); + cur = ggml_cont(ctx, cur); cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, ggml_element_size(cur) * n_embd_head_v_out * n_head, From 1647e2b08ee4b9ba02a8bbbb1210d8ff6edc6395 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 13:50:51 +0800 Subject: [PATCH 065/100] fix --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 534ff0451038c..2636a512db218 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -595,6 +595,7 @@ static struct ggml_tensor * llm_build_kqv( padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; + padded_v = ggml_cont(ctx, padded_v); } cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, @@ -613,8 +614,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, cur); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, + cur = ggml_cont(ctx, ggml_view_3d(ctx, 
ggml_cont(ctx, cur), n_embd_head_v, n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); From ef0b5c43f1179a2ba974deb69d312847b9a48cb8 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:00:07 +0800 Subject: [PATCH 066/100] fix --- src/llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2636a512db218..3dc154ddc686c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -614,9 +614,8 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, ggml_view_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v, n_head, n_tokens, + cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, - ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); } From 00fd137665b7024069cf5df082ae7e06ee000631 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:08:55 +0800 Subject: [PATCH 067/100] fix --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3dc154ddc686c..a67f86087d48a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -595,10 +595,9 @@ static struct ggml_tensor * llm_build_kqv( padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; - padded_v = ggml_cont(ctx, padded_v); } - cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + cur = ggml_flash_attn_ext(ctx, q, k, ggml_cont(ctx, padded_v), kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); LLAMA_LOG_INFO("kq_scale: %f\n", kq_scale); @@ -614,12 +613,13 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { + cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, 0)); + } else { + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); From 64721f6f889b241f7b26cae4793fad61e2a16fe2 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:09:29 +0800 Subject: [PATCH 068/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index a67f86087d48a..22a0dc13fa391 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -615,7 +615,7 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out*n_head, 0)); } else { cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 1c49614fded7d9a47354c0370a9b6d83d09b3c46 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:17:35 +0800 Subject: [PATCH 069/100] fix --- src/llama.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 
22a0dc13fa391..723b41130add8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -613,13 +613,12 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out*n_head, + ggml_element_size(cur) * n_embd_head_v_out, 0)); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } + + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); @@ -1338,7 +1337,7 @@ struct llm_build_context { struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, flash_attn ? (n_embd_head_k > n_embd_head_v ? n_embd_head_k * n_embd_head_k : n_kv) : n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); cb(lctx.inp_KQ_mask, "KQ_mask", -1); ggml_set_input(lctx.inp_KQ_mask); From e07b5d9c11746499cf7f1fe150cb9eafd5fe770a Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:27:15 +0800 Subject: [PATCH 070/100] fix --- src/llama.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 723b41130add8..b90eb0b215e36 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -610,7 +610,9 @@ static struct ggml_tensor * llm_build_kqv( LLAMA_LOG_INFO("k shape: [%ld, %ld, %ld]\n", k->ne[0], k->ne[1], k->ne[2]); LLAMA_LOG_INFO("padded_v shape: [%ld, %ld, %ld]\n", padded_v->ne[0], padded_v->ne[1], padded_v->ne[2]); - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + if (cur->type == GGML_TYPE_F32) { + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + } if (n_embd_head_v < n_embd_head_k) { cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, @@ -1337,7 +1339,7 @@ struct llm_build_context { struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, flash_attn ? (n_embd_head_k > n_embd_head_v ? n_embd_head_k * n_embd_head_k : n_kv) : n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); cb(lctx.inp_KQ_mask, "KQ_mask", -1); ggml_set_input(lctx.inp_KQ_mask); From f1fbc194205134c4ecb23620a7a23fbdd9af497f Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:36:23 +0800 Subject: [PATCH 071/100] fix --- src/llama.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index b90eb0b215e36..50d8169623a0d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -595,9 +595,10 @@ static struct ggml_tensor * llm_build_kqv( padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; + padded_v = ggml_cont(ctx, padded_v); } - cur = ggml_flash_attn_ext(ctx, q, k, ggml_cont(ctx, padded_v), kq_mask, kq_scale, hparams.f_max_alibi_bias, + cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); LLAMA_LOG_INFO("kq_scale: %f\n", kq_scale); @@ -615,8 +616,10 @@ static struct ggml_tensor * llm_build_kqv( } if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, + cur = ggml_reshape_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out, n_head, n_tokens); + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); } From bb532d8c4fc49a5dace0598cc191551d80216135 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 14:53:04 +0800 Subject: [PATCH 072/100] fix --- src/llama.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 50d8169623a0d..a6aebf638de0a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -601,26 +601,19 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); - LLAMA_LOG_INFO("kq_scale: %f\n", kq_scale); - - // 检查 Softmax 参数 - if (hparams.attn_soft_cap) { - LLAMA_LOG_INFO("Soft capping applied: %f\n", hparams.f_attn_logit_softcapping); - } - LLAMA_LOG_INFO("q shape: [%ld, %ld, %ld]\n", q->ne[0], q->ne[1], q->ne[2]); - LLAMA_LOG_INFO("k shape: [%ld, %ld, %ld]\n", k->ne[0], k->ne[1], k->ne[2]); - LLAMA_LOG_INFO("padded_v shape: [%ld, %ld, %ld]\n", padded_v->ne[0], padded_v->ne[1], padded_v->ne[2]); - if (cur->type == GGML_TYPE_F32) { ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); } if (n_embd_head_v < n_embd_head_k) { + LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); cur = ggml_reshape_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out, n_head, n_tokens); + LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); + LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 42908f969edbfee97453643806e67087d5ffaf09 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 15:03:15 +0800 Subject: [PATCH 073/100] fix --- src/llama.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a6aebf638de0a..22dee01e4d98b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -601,22 +601,19 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - if (cur->type == GGML_TYPE_F32) { - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - } + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_reshape_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out, n_head, n_tokens); + cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out, - ggml_element_size(cur) * n_embd_head_v_out * n_head, + cur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_head_v* n_head, n_tokens, + ggml_element_size(cur) * n_embd_head_v_out*n_head, 0)); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); + } else { + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); From 616218ab3b61ed91dcefda0c909f74209447bd51 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 15:28:39 +0800 Subject: [PATCH 074/100] fix --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 22dee01e4d98b..86c9ed62e0435 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -607,8 +607,8 @@ static struct ggml_tensor * llm_build_kqv( LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_head_v* n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out*n_head, + cur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_head_v, n_head*n_tokens, + ggml_element_size(cur) * n_embd_head_v_out, 0)); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); } else { From 6993e96f13c718b43529c8fa3852b973420ec2ce Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 15:29:19 +0800 Subject: [PATCH 075/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 86c9ed62e0435..678381595e0c1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -605,7 +605,7 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out, n_head*n_tokens); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); cur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_head_v, n_head*n_tokens, ggml_element_size(cur) * n_embd_head_v_out, From 848cadec8086a1f77383b76ea50fce7ecec8b99b Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 15:38:45 +0800 Subject: [PATCH 076/100] fix --- src/llama.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 678381595e0c1..4f731835d76b9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -605,15 +605,16 @@ static struct ggml_tensor * 
llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out, n_head*n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_head, n_embd_head_v_out, n_tokens); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_head_v, n_head*n_tokens, - ggml_element_size(cur) * n_embd_head_v_out, + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_embd_head_v, n_tokens, + ggml_element_size(cur) * n_head, + ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } + + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); From 892bbc61ff2fc61919598ea89d6933a21f6aca39 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 15:44:30 +0800 Subject: [PATCH 077/100] fix --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4f731835d76b9..5e867bd971d40 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -605,11 +605,11 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_reshape_3d(ctx, cur, n_head, n_embd_head_v_out, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_head, n_tokens, n_embd_head_v_out); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_embd_head_v, n_tokens, + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_tokens, n_embd_head_v, ggml_element_size(cur) * n_head, - ggml_element_size(cur) * n_embd_head_v_out * n_head, + ggml_element_size(cur) * n_head * n_tokens, 0)); LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); } From c0827df7fd2c442ce8918902650d2ac1ccf03060 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 16:22:03 +0800 Subject: [PATCH 078/100] fix --- src/llama.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5e867bd971d40..ef0af709832ec 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -604,17 +604,13 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_reshape_3d(ctx, cur, n_head, n_tokens, n_embd_head_v_out); - LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_tokens, n_embd_head_v, - ggml_element_size(cur) * n_head, - ggml_element_size(cur) * n_head * n_tokens, + cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); + cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, + ggml_element_size(cur) * n_embd_head_v_out, 0)); - LLAMA_LOG_INFO("cur shape: [%ld, %ld, %ld]\n", cur->ne[0], cur->ne[1], cur->ne[2]); + } else { + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct 
ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); From ac4409f28dfec78220c7b9bf775b74d138068ebd Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 16:29:44 +0800 Subject: [PATCH 079/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index ef0af709832ec..d5515820b0959 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -606,7 +606,7 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out*n_head, 0)); } else { cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From b1a1562c15df2c66d57d87c93abfac9aa41885df Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 16:37:23 +0800 Subject: [PATCH 080/100] fix --- src/llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d5515820b0959..10cbe41f44f6b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -604,13 +604,13 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_reshape_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v_out*n_head, n_tokens); - cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out*n_head, + cur = ggml_cont(ctx, ggml_view_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v, n_head, n_tokens, + ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } + + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); From b0778a6d1e883af8f8faf35e01c8b6557f819c93 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 16:45:19 +0800 Subject: [PATCH 081/100] fix --- src/llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 10cbe41f44f6b..e74b0d28ad017 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -604,9 +604,8 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, ggml_view_3d(ctx, ggml_cont(ctx, cur), n_embd_head_v, n_head, n_tokens, + cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, ggml_element_size(cur) * n_embd_head_v_out, - ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); } From eec8dadd2e1cead135fd9b4890deacec53250ada Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 16:53:37 +0800 Subject: [PATCH 082/100] fix --- src/llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index e74b0d28ad017..2bb65844fbb9c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -604,8 +604,10 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, ggml_view_2d(ctx, ggml_cont(ctx, cur), n_embd_head_v*n_head, n_tokens, + cur = ggml_cont(ctx, cur); + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, 
ggml_element_size(cur) * n_embd_head_v_out, + ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); } From b218b9bb9aacc812d8e82edadce16dedc01d45e1 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:06:44 +0800 Subject: [PATCH 083/100] fix --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2bb65844fbb9c..4ca7dcc362b1a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -605,8 +605,8 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { cur = ggml_cont(ctx, cur); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, - ggml_element_size(cur) * n_embd_head_v_out, + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_embd_head_v, n_tokens, + ggml_element_size(cur) * n_head, ggml_element_size(cur) * n_embd_head_v_out * n_head, 0)); } From 28471ebc0bc73ce9b14b6e86f2ff35ba903bdfd5 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:17:17 +0800 Subject: [PATCH 084/100] fix --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4ca7dcc362b1a..a08fc81102d28 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -605,9 +605,9 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { cur = ggml_cont(ctx, cur); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_head, n_embd_head_v, n_tokens, - ggml_element_size(cur) * n_head, - ggml_element_size(cur) * n_embd_head_v_out * n_head, + cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, + ggml_row_size(cur->type, n_embd_head_v_out), + ggml_row_size(cur->type, n_embd_head_v_out * n_head), 0)); } From f298a83624feb907881f773ace832fefff6222e6 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:21:24 +0800 Subject: [PATCH 085/100] fix --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a08fc81102d28..3022480b6f777 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -606,8 +606,8 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { cur = ggml_cont(ctx, cur); cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, - ggml_row_size(cur->type, n_embd_head_v_out), - ggml_row_size(cur->type, n_embd_head_v_out * n_head), + cur->nb[1], + cur->nb[2], 0)); } From 5001deea0e187bf98c8318d4f2d54ad204c784b6 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:36:53 +0800 Subject: [PATCH 086/100] fix --- src/llama.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3022480b6f777..77ca7dc04d5d1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -595,7 +595,6 @@ static struct ggml_tensor * llm_build_kqv( padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; - padded_v = ggml_cont(ctx, padded_v); } cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, @@ -604,11 +603,11 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_cont(ctx, cur); + cur = ggml_reshape_3d(ctx, cur, n_embd_head_v, n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, - cur->nb[1], - cur->nb[2], - 0)); + ggml_row_size(cur->type, n_embd_head_v_out), + ggml_row_size(cur->type, 
n_embd_head_v_out * n_head), + 0)); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 3dba0171d998052b18c6b9f53b5835aca8de950e Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:40:23 +0800 Subject: [PATCH 087/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 77ca7dc04d5d1..b503f1818fbec 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -603,7 +603,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_reshape_3d(ctx, cur, n_embd_head_v, n_head, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens); cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_row_size(cur->type, n_embd_head_v_out), ggml_row_size(cur->type, n_embd_head_v_out * n_head), From 3f960653a7c89dbb5d3c9798ca74f6482ab4e4eb Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 17:40:40 +0800 Subject: [PATCH 088/100] fix --- src/llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama.cpp b/src/llama.cpp index b503f1818fbec..23b18822613ec 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -595,6 +595,7 @@ static struct ggml_tensor * llm_build_kqv( padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); n_embd_head_v_out = n_embd_head_k; + padded_v = ggml_cont(ctx, padded_v); } cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, From 2dd8afc4ed01add6e75ceddd379a4a6803b3fc8c Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 18:03:53 +0800 Subject: [PATCH 089/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 23b18822613ec..503b01b07c585 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -608,7 +608,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_row_size(cur->type, n_embd_head_v_out), ggml_row_size(cur->type, n_embd_head_v_out * n_head), - 0)); + ggml_row_size(cur->type, n_embd_head_v_out * n_head * (n_embd_head_k - n_embd_head_v)))); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 6dcfa8977de41110e9748bfc9d904c71b9f6cf17 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 18:09:16 +0800 Subject: [PATCH 090/100] fix --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 503b01b07c585..775e1b39ed930 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -608,7 +608,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, ggml_row_size(cur->type, n_embd_head_v_out), ggml_row_size(cur->type, n_embd_head_v_out * n_head), - ggml_row_size(cur->type, n_embd_head_v_out * n_head * (n_embd_head_k - n_embd_head_v)))); + ggml_element_size(cur) * (n_embd_head_k - n_embd_head_v))); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 50c79409a405cb141c32d7e6d959aec81521415b Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 18:21:00 +0800 Subject: [PATCH 091/100] fix --- src/llama.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 775e1b39ed930..56b4ae9d262a0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -590,11 +590,9 @@ static struct ggml_tensor * 
llm_build_kqv( cb(v, "v", il); struct ggml_tensor * padded_v = v; - int64_t n_embd_head_v_out = n_embd_head_v; if (n_embd_head_v < n_embd_head_k) { padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); - n_embd_head_v_out = n_embd_head_k; padded_v = ggml_cont(ctx, padded_v); } @@ -604,11 +602,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); if (n_embd_head_v < n_embd_head_k) { - cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens); - cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens, - ggml_row_size(cur->type, n_embd_head_v_out), - ggml_row_size(cur->type, n_embd_head_v_out * n_head), - ggml_element_size(cur) * (n_embd_head_k - n_embd_head_v))); + cur = ggml_view_1d(ctx, ggml_cont(ctx, cur), n_embd_head_k*n_head, n_tokens); } cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); From 87f1435c7a6d81456ac866166897e9eb981529ee Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 18:21:29 +0800 Subject: [PATCH 092/100] fix --- src/llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 56b4ae9d262a0..4d91e5b51be3c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -593,7 +593,6 @@ static struct ggml_tensor * llm_build_kqv( if (n_embd_head_v < n_embd_head_k) { padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); cb(padded_v, "padded_v", il); - padded_v = ggml_cont(ctx, padded_v); } cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, From f8f5be1942f7b5e58c03fced7a25792bbf95dca8 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 18:25:46 +0800 Subject: [PATCH 093/100] fix --- src/llama.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4d91e5b51be3c..648b3919a7b02 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -589,21 +589,11 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * padded_v = v; - if (n_embd_head_v < n_embd_head_k) { - padded_v = ggml_pad(ctx, v, 0, k->ne[0] - v->ne[1], 0, 0); - cb(padded_v, "padded_v", il); - } - - cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - if (n_embd_head_v < n_embd_head_k) { - cur = ggml_view_1d(ctx, ggml_cont(ctx, cur), n_embd_head_k*n_head, n_tokens); - } - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); @@ -9577,8 +9567,8 @@ struct llama_context * llama_init_from_model( params.flash_attn = false; } - if (params.flash_attn && model->hparams.n_embd_head_k < model->hparams.n_embd_head_v) { - LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k >= n_embd_head_v - forcing off\n", __func__); + if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { + LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k != n_embd_head_v - forcing off\n", __func__); params.flash_attn = false; } From 97677f7f540ac42d949257f5ebbccdcdc7eab80f Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Thu, 27 Feb 2025 22:05:17 +0800 Subject: [PATCH 094/100] fix --- ggml/src/ggml-cuda/pad.cu | 1 - src/llama.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 8f8cced4b4d31..353a89589ee3c 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -57,7 +57,6 @@ static void pad_f32_cuda(const float * x, float * dst, dim3 gridDim(num_blocks, ne1, ne2*ne3); pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); } -#include "ggml-impl.h" static void pad_f16_cuda(const half * x, half * dst, const int ne00, const int ne01, const int ne02, const int ne03, diff --git a/src/llama.cpp b/src/llama.cpp index 648b3919a7b02..e65d2cb9d154e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9568,7 +9568,7 @@ struct llama_context * llama_init_from_model( } if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { - LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k != n_embd_head_v - forcing off\n", __func__); + LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); params.flash_attn = false; } From f24aed85c98358cc23af673034bc7fad72bfd4ba Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Fri, 28 Feb 2025 15:51:53 +0800 Subject: [PATCH 095/100] fix --- examples/server/server.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4c7212759de37..f689eb714d391 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3933,6 +3933,11 @@ int main(int argc, char ** argv) { }; const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + if (req.body.find("chat_history") != std::string::npos) { + res_ok(res, ""); + return; + } + json data = json::parse(req.body); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -3943,6 +3948,11 @@ int main(int argc, char ** argv) { }; const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + if (req.body.find("chat_history") != std::string::npos) { + res_ok(res, ""); + return; + } + json data = oaicompat_completion_params_parse(json::parse(req.body)); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -4036,6 +4046,11 @@ int main(int argc, char ** argv) { return; } + if (req.body.find("chat_history") != std::string::npos) { + res_ok(res, ""); + return; + } + auto body = json::parse(req.body); json data = oaicompat_completion_params_parse(body, 
params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get()); From a29ac5726300185d4c706de0e60ccf96c469f604 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Fri, 28 Feb 2025 15:54:53 +0800 Subject: [PATCH 096/100] fix --- examples/server/server.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f689eb714d391..aacd80d3a3fad 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3934,7 +3934,7 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, ""); + res_ok(res, {{ "success", true }}); return; } @@ -3949,7 +3949,7 @@ int main(int argc, char ** argv) { const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, ""); + res_ok(res, {{ "success", true }}); return; } @@ -4047,7 +4047,7 @@ int main(int argc, char ** argv) { } if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, ""); + res_ok(res, {{ "success", true }}); return; } From d19e2da26ef57d1fd95b8a7f9498211d1b37d3b2 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Fri, 28 Feb 2025 15:57:34 +0800 Subject: [PATCH 097/100] fix --- examples/server/server.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aacd80d3a3fad..56fac45faeda0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3841,6 +3841,7 @@ int main(int argc, char ** argv) { //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); if (prompt.contains("chat_history")) { + res_ok(res, {{ "success", true }}); return; } @@ -3932,7 +3933,7 @@ int main(int argc, char ** argv) { } }; - const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions = [&handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { if (req.body.find("chat_history") != std::string::npos) { res_ok(res, {{ "success", true }}); return; @@ -3947,7 +3948,7 @@ int main(int argc, char ** argv) { OAICOMPAT_TYPE_NONE); }; - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions_oai = [&handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { if (req.body.find("chat_history") != std::string::npos) { res_ok(res, {{ "success", true }}); return; @@ -4039,7 +4040,7 @@ int main(int argc, char ** argv) { OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { LOG_DBG("request: %s\n", req.body.c_str()); if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); From a951520b4c9e9df3b0dfafcab08154c115099a26 Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Fri, 28 Feb 2025 16:25:59 +0800 Subject: [PATCH 098/100] tmp --- .../{atomic_hash_map.hpp => lock-free.hpp} | 289 +++++++++++++++++- examples/server/server.cpp | 94 +++--- 2 files changed, 321 insertions(+), 62 deletions(-) rename examples/server/{atomic_hash_map.hpp => lock-free.hpp} (66%) diff --git a/examples/server/atomic_hash_map.hpp b/examples/server/lock-free.hpp similarity index 66% rename from examples/server/atomic_hash_map.hpp rename to examples/server/lock-free.hpp index f511b2159742a..c0b872a094428 100644 --- a/examples/server/atomic_hash_map.hpp +++ b/examples/server/lock-free.hpp @@ -216,7 +216,7 @@ namespace std { * See the License for the specific language governing permissions and * limitations under the License. */ -namespace atomic { +namespace lock_free { size_t nextPowTwo(size_t v) { #ifdef _MSC_VER @@ -593,4 +593,289 @@ namespace atomic { explicit MutableData(const T& init) : data(init) {} }; -} // namespace atomic \ No newline at end of file + /** + * A very simple atomic single-linked list primitive. + * + * Usage: + * + * class MyClass { + * _linked_list_hook hook_; + * } + * + * _linked_list list; + * list.insert(&a); + * list.sweep([] (MyClass* c) { doSomething(c); } + */ + template + struct _linked_list_hook { + T* next{nullptr}; + }; + + template T::*HookMember> + class _linked_list { + public: + _linked_list() {} + + _linked_list(const _linked_list&) = delete; + _linked_list& operator=(const _linked_list&) = + delete; + + _linked_list(_linked_list&& other) noexcept + : head_(other.head_.exchange(nullptr, std::memory_order_acq_rel)) {} + + // Absent because would be too error-prone to use correctly because of + // the requirement that lists are empty upon destruction. + _linked_list& operator=( + _linked_list&& other) noexcept = delete; + + /** + * Move the currently held elements to a new list. + * The current list becomes empty, but concurrent threads + * might still add new elements to it. + * + * Equivalent to calling a move constructor, but more linter-friendly + * in case you still need the old list. + */ + _linked_list spliceAll() { return std::move(*this); } + + /** + * Move-assign the current list to `other`, then reverse-sweep + * the old list with the provided callback `func`. + * + * A safe replacement for the move assignment operator, which is absent + * because of the resource leak concerns. + */ + template + void reverseSweepAndAssign(_linked_list&& other, F&& func) { + auto otherHead = other.head_.exchange(nullptr, std::memory_order_acq_rel); + auto head = head_.exchange(otherHead, std::memory_order_acq_rel); + unlinkAll(head, std::forward(func)); + } + + /** + * Note: The list must be empty on destruction. + */ + ~_linked_list() { assert(empty()); } + + /** + * Returns the current head of the list. + * + * WARNING: The returned pointer might not be valid if the list + * is modified concurrently! + */ + T* unsafeHead() const { return head_.load(std::memory_order_acquire); } + + /** + * Returns true if the list is empty. + * + * WARNING: This method's return value is only valid for a snapshot + * of the state, it might become stale as soon as it's returned. + */ + bool empty() const { return unsafeHead() == nullptr; } + + /** + * Atomically insert t at the head of the list. + * @return True if the inserted element is the only one in the list + * after the call. 
+ */ + bool insertHead(T* t) { + assert(next(t) == nullptr); + + auto oldHead = head_.load(std::memory_order_relaxed); + do { + next(t) = oldHead; + /* oldHead is updated by the call below. + + NOTE: we don't use next(t) instead of oldHead directly due to + compiler bugs (GCC prior to 4.8.3 (bug 60272), clang (bug 18899), + MSVC (bug 819819); source: + http://en.cppreference.com/w/cpp/atomic/atomic/compare_exchange */ + } while (!head_.compare_exchange_weak( + oldHead, t, std::memory_order_release, std::memory_order_relaxed)); + + return oldHead == nullptr; + } + + /** + * Replaces the head with nullptr, + * and calls func() on the removed elements in the order from tail to head. + * Returns false if the list was empty. + */ + template + bool sweepOnce(F&& func) { + if (auto head = head_.exchange(nullptr, std::memory_order_acq_rel)) { + auto rhead = reverse(head); + unlinkAll(rhead, std::forward(func)); + return true; + } + return false; + } + + /** + * Repeatedly replaces the head with nullptr, + * and calls func() on the removed elements in the order from tail to head. + * Stops when the list is empty. + */ + template + void sweep(F&& func) { + while (sweepOnce(func)) { + } + } + + /** + * Similar to sweep() but calls func() on elements in LIFO order. + * + * func() is called for all elements in the list at the moment + * reverseSweep() is called. Unlike sweep() it does not loop to ensure the + * list is empty at some point after the last invocation. This way callers + * can reason about the ordering: elements inserted since the last call to + * reverseSweep() will be provided in LIFO order. + * + * Example: if elements are inserted in the order 1-2-3, the callback is + * invoked 3-2-1. If the callback moves elements onto a stack, popping off + * the stack will produce the original insertion order 1-2-3. + */ + template + void reverseSweep(F&& func) { + // We don't loop like sweep() does because the overall order of callbacks + // would be strand-wise LIFO which is meaningless to callers. + auto head = head_.exchange(nullptr, std::memory_order_acq_rel); + unlinkAll(head, std::forward(func)); + } + + private: + std::atomic head_{nullptr}; + + static T*& next(T* t) { return (t->*HookMember).next; } + + /* Reverses a linked list, returning the pointer to the new head + (old tail) */ + static T* reverse(T* head) { + T* rhead = nullptr; + while (head != nullptr) { + auto t = head; + head = next(t); + next(t) = rhead; + rhead = t; + } + return rhead; + } + + /* Unlinks all elements in the linked list fragment pointed to by `head', + * calling func() on every element */ + template + static void unlinkAll(T* head, F&& func) { + while (head != nullptr) { + auto t = head; + head = next(t); + next(t) = nullptr; + func(t); + } + } + }; + + /** + * A very simple atomic single-linked list primitive. + * + * Usage: + * + * linked_list list; + * list.insert(a); + * list.sweep([] (MyClass& c) { doSomething(c); } + */ + + template + class linked_list { + public: + linked_list() {} + linked_list(const linked_list&) = delete; + linked_list& operator=(const linked_list&) = delete; + linked_list(linked_list&& other) noexcept = default; + linked_list& operator=(linked_list&& other) noexcept { + list_.reverseSweepAndAssign(std::move(other.list_), [](Wrapper* node) { + delete node; + }); + return *this; + } + + ~linked_list() { + sweep([](T&&) {}); + } + + bool empty() const { return list_.empty(); } + + /** + * Atomically insert t at the head of the list. 
+ * @return True if the inserted element is the only one in the list + * after the call. + */ + bool insertHead(T t) { + auto wrapper = std::make_unique(std::move(t)); + + return list_.insertHead(wrapper.release()); + } + + /** + * Repeatedly pops element from head, + * and calls func() on the removed elements in the order from tail to head. + * Stops when the list is empty. + */ + template + void sweep(F&& func) { + list_.sweep([&](Wrapper* wrapperPtr) mutable { + std::unique_ptr wrapper(wrapperPtr); + + func(std::move(wrapper->data)); + }); + } + + /** + * Sweeps the list a single time, as a single point in time swap with the + * current contents of the list. + * + * Unlike sweep() it does not loop to ensure the list is empty at some point + * after the last invocation. + * + * Returns false if the list is empty. + */ + template + bool sweepOnce(F&& func) { + return list_.sweepOnce([&](Wrapper* wrappedPtr) { + std::unique_ptr wrapper(wrappedPtr); + func(std::move(wrapper->data)); + }); + } + + /** + * Similar to sweep() but calls func() on elements in LIFO order. + * + * func() is called for all elements in the list at the moment + * reverseSweep() is called. Unlike sweep() it does not loop to ensure the + * list is empty at some point after the last invocation. This way callers + * can reason about the ordering: elements inserted since the last call to + * reverseSweep() will be provided in LIFO order. + * + * Example: if elements are inserted in the order 1-2-3, the callback is + * invoked 3-2-1. If the callback moves elements onto a stack, popping off + * the stack will produce the original insertion order 1-2-3. + */ + template + void reverseSweep(F&& func) { + list_.reverseSweep([&](Wrapper* wrapperPtr) mutable { + std::unique_ptr wrapper(wrapperPtr); + + func(std::move(wrapper->data)); + }); + } + + private: + struct Wrapper { + explicit Wrapper(T&& t) : data(std::move(t)) {} + + _linked_list_hook hook; + T data; + }; + _linked_list list_; + }; + +} // namespace lock_free \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 56fac45faeda0..3173a328b2900 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -18,7 +18,7 @@ #include "index.html.gz.hpp" #include "loading.html.hpp" -#include "atomic_hash_map.hpp" +#include "lock-free.hpp" #include #include @@ -1491,12 +1491,14 @@ struct server_metrics { }; struct server_queue { - int id = 0; + std::atomic id = 0; bool running; // queues - std::deque queue_tasks; - std::deque queue_tasks_deferred; + lock_free::linked_list queue_tasks; + lock_free::linked_list queue_tasks_deferred; + + lock_free::hash_map cancel_tasks = {10000}; std::mutex mutex_tasks; std::condition_variable condition_tasks; @@ -1507,17 +1509,13 @@ struct server_queue { // Add a new task to the end of the queue int post(server_task task, bool front = false) { - std::unique_lock lock(mutex_tasks); GGML_ASSERT(task.id != -1); // if this is cancel task make sure to clean up pending tasks if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d, front = %d\n", task.id, front); - if (front) { - queue_tasks.push_front(std::move(task)); + cancel_tasks.insert(task.id_target, task.id_target); } else { - queue_tasks.push_back(std::move(task)); + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); + queue_tasks.insertHead(std::move(task)); } condition_tasks.notify_one(); return task.id; @@ -1525,20 +1523,16 @@ struct server_queue { // multi-task 
version of post() int post(std::vector & tasks, bool front = false) { - std::unique_lock lock(mutex_tasks); for (auto & task : tasks) { if (task.id == -1) { task.id = id++; } // if this is cancel task make sure to clean up pending tasks if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); - if (front) { - queue_tasks.push_front(std::move(task)); + cancel_tasks.insert(task.id_target, task.id_target); } else { - queue_tasks.push_back(std::move(task)); + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); + queue_tasks.insertHead(std::move(task)); } } condition_tasks.notify_one(); @@ -1547,15 +1541,13 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { - std::unique_lock lock(mutex_tasks); QUE_DBG("defer task, id = %d\n", task.id); - queue_tasks_deferred.push_back(std::move(task)); + queue_tasks_deferred.insertHead(std::move(task)); condition_tasks.notify_one(); } // Get the next id for creating a new task int get_new_id() { - std::unique_lock lock(mutex_tasks); int new_id = id++; return new_id; } @@ -1572,17 +1564,16 @@ struct server_queue { // Call when the state of one slot is changed, it will move one task from deferred to main queue void pop_deferred_task() { - std::unique_lock lock(mutex_tasks); if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_back(std::move(queue_tasks_deferred.front())); - queue_tasks_deferred.pop_front(); + queue_tasks_deferred.sweepOnce([&](server_task & task) { + queue_tasks.insertHead(std::move(task)); + }); } condition_tasks.notify_one(); } // end the start_loop routine void terminate() { - std::unique_lock lock(mutex_tasks); running = false; condition_tasks.notify_all(); } @@ -1601,21 +1592,21 @@ struct server_queue { QUE_DBG("%s", "processing new tasks\n"); while (true) { - std::unique_lock lock(mutex_tasks); if (!running) { QUE_DBG("%s", "terminate\n"); return; } if (queue_tasks.empty()) { - lock.unlock(); break; } - server_task task = queue_tasks.front(); - queue_tasks.pop_front(); - lock.unlock(); - - QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(std::move(task)); + queue_tasks.sweepOnce([&](server_task & task) { + QUE_DBG("processing task, id = %d\n", task.id); + if (cancel_tasks.erase(task.id) > 0) { + QUE_DBG("task id = %d is canceled\n", task.id); + return; + } + callback_new_task(std::move(task)); + }); } // all tasks in the current loop is processed, slots data is now ready @@ -1624,42 +1615,25 @@ struct server_queue { callback_update_slots(); QUE_DBG("%s", "waiting for new tasks\n"); - { - std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { + condition_tasks.wait(lock, [&]{ + return (!queue_tasks.empty() || !running); + }); } } } - -private: - void cleanup_pending_task(int id_target) { - // no need lock because this is called exclusively by post() - auto rm_func = [id_target](const server_task & task) { - return task.id_target == id_target; - }; - queue_tasks.erase( - std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), - queue_tasks.end()); - queue_tasks_deferred.erase( - std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), 
rm_func), - queue_tasks_deferred.end()); - } }; struct server_response { // for keeping track of all tasks waiting for the result - atomic::hash_map waiting_task_ids = {10000}; + lock_free::hash_map waiting_task_ids = {10000}; // the main result queue (using ptr for polymorphism) - atomic::hash_map queue_results = {10000}; + lock_free::hash_map queue_results = {10000}; std::mutex mutex_results; std::condition_variable condition_results; From f33d3acf02beccbe943dc94c03a4dd420ff8217a Mon Sep 17 00:00:00 2001 From: orca-zhang Date: Fri, 28 Feb 2025 16:32:29 +0800 Subject: [PATCH 099/100] fix --- examples/server/server.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3173a328b2900..c1ef196d171b3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1497,6 +1497,7 @@ struct server_queue { // queues lock_free::linked_list queue_tasks; lock_free::linked_list queue_tasks_deferred; + std::atomic n_queue_tasks_deferred = 0; lock_free::hash_map cancel_tasks = {10000}; @@ -1543,6 +1544,7 @@ struct server_queue { void defer(server_task task) { QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.insertHead(std::move(task)); + n_queue_tasks_deferred++; condition_tasks.notify_one(); } @@ -1565,9 +1567,10 @@ struct server_queue { // Call when the state of one slot is changed, it will move one task from deferred to main queue void pop_deferred_task() { if (!queue_tasks_deferred.empty()) { - queue_tasks_deferred.sweepOnce([&](server_task & task) { + queue_tasks_deferred.sweepOnce([&](server_task && task) { queue_tasks.insertHead(std::move(task)); }); + n_queue_tasks_deferred--; } condition_tasks.notify_one(); } @@ -1599,7 +1602,7 @@ struct server_queue { if (queue_tasks.empty()) { break; } - queue_tasks.sweepOnce([&](server_task & task) { + queue_tasks.sweepOnce([&](server_task && task) { QUE_DBG("processing task, id = %d\n", task.id); if (cancel_tasks.erase(task.id) > 0) { QUE_DBG("task id = %d is canceled\n", task.id); @@ -1620,6 +1623,7 @@ struct server_queue { return; } if (queue_tasks.empty()) { + std::unique_lock lock(mutex_tasks); condition_tasks.wait(lock, [&]{ return (!queue_tasks.empty() || !running); }); @@ -2595,7 +2599,7 @@ struct server_context { res->slots_data = std::move(slots_data); res->n_idle_slots = n_idle_slots; res->n_processing_slots = n_processing_slots; - res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->n_tasks_deferred = queue_tasks.n_queue_tasks_deferred; res->t_start = metrics.t_start; res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); From 78b0d1d9cdd43e6737ca97cf5ed1644885759c9b Mon Sep 17 00:00:00 2001 From: Orca Date: Mon, 10 Mar 2025 17:43:20 +0800 Subject: [PATCH 100/100] tmp --- examples/server/server.cpp | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c1ef196d171b3..2b634697dcc46 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3819,7 +3819,6 @@ int main(int argc, char ** argv) { //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); if (prompt.contains("chat_history")) { - res_ok(res, {{ "success", true }}); return; } @@ -3911,12 +3910,7 @@ int main(int argc, char ** argv) { } }; - const auto handle_completions = [&handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, {{ "success", true }}); - return; - } - + const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -3926,12 +3920,7 @@ int main(int argc, char ** argv) { OAICOMPAT_TYPE_NONE); }; - const auto handle_completions_oai = [&handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, {{ "success", true }}); - return; - } - + const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = oaicompat_completion_params_parse(json::parse(req.body)); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -4025,11 +4014,6 @@ int main(int argc, char ** argv) { return; } - if (req.body.find("chat_history") != std::string::npos) { - res_ok(res, {{ "success", true }}); - return; - } - auto body = json::parse(req.body); json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
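
Note on the queue rework in patches 098-099 above: server_queue's mutex-protected std::deque members are replaced by the lock_free::linked_list / lock_free::hash_map primitives from lock-free.hpp, and patch 100 then backs out the temporary chat_history short-circuit from the HTTP handlers. For readers following the queue change, the sketch below is a minimal, self-contained illustration of the pattern _linked_list builds on: a Treiber-style CAS push (insertHead) plus a single atomic exchange that detaches the whole chain for traversal (sweep). It is a simplified stand-in (plain int payload, LIFO visit order, no reversal step, no hook/wrapper types), not the exact code shipped in the patch.

    // Minimal sketch (simplified, not the patch's exact code): the core of
    // lock_free::_linked_list -- a CAS-based push and an atomic detach-and-walk.
    #include <atomic>
    #include <cstdio>

    struct Node {
        int    value;
        Node * next;
    };

    struct LockFreeList {
        std::atomic<Node *> head{nullptr};

        // Push n at the head; returns true if the list was empty before the call.
        bool insertHead(Node * n) {
            Node * old_head = head.load(std::memory_order_relaxed);
            do {
                n->next = old_head;  // old_head is refreshed by each failed CAS
            } while (!head.compare_exchange_weak(old_head, n,
                                                 std::memory_order_release,
                                                 std::memory_order_relaxed));
            return old_head == nullptr;
        }

        // Detach the whole chain with one exchange, then visit it with no lock held.
        // Note: visits in LIFO order; the patch's sweepOnce() reverses first to get FIFO.
        template <typename F>
        void sweep(F && func) {
            Node * n = head.exchange(nullptr, std::memory_order_acq_rel);
            while (n != nullptr) {
                Node * next = n->next;
                n->next = nullptr;
                func(n);
                n = next;
            }
        }
    };

    int main() {
        LockFreeList list;
        Node a{1, nullptr};
        Node b{2, nullptr};
        list.insertHead(&a);
        list.insertHead(&b);
        list.sweep([](Node * n) { std::printf("%d\n", n->value); });  // prints 2, then 1
        return 0;
    }

The appeal of this shape for the task queue is that producers (HTTP worker threads calling post() or defer()) never take a lock, and the consumer detaches the entire pending chain in O(1) and processes it outside any critical section; that is what lets the patch drop the std::unique_lock around post(), defer() and pop_deferred_task() while keeping mutex_tasks only for the condition-variable wait on an empty queue.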