Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pi/gg/SYSTEM.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
- Ask the user to tell you what model was used and write it in place of [MODEL]
- Always create the pull requests in draft mode

Commits:
- On every commit that you make, include an "Assisted-by: llama.cpp:local pi" tag
- On every commit that you make, include an "Assisted-by: pi:llama.cpp/[MODEL]" tag
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
Expand Down
8 changes: 4 additions & 4 deletions src/llama-adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
/*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
Expand All @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
};

// make tensors
tensors.reserve(hparams.n_layer);
tensors.reserve(hparams.n_layer());
tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
for (size_t il = 1; il < hparams.n_layer(); il++) {
ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
Expand Down Expand Up @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
layer_start = il_start;
layer_end = il_end;

for (size_t il = 1; il < hparams.n_layer; il++) {
for (size_t il = 1; il < hparams.n_layer(); il++) {
assert(tensors[il] != nullptr);

const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
Expand Down
10 changes: 5 additions & 5 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer &&
model.n_gpu_layers() > model.hparams.n_layer() &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
Expand Down Expand Up @@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
Expand Down Expand Up @@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(

if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
Expand All @@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(

if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
Expand All @@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
}

if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
model->hparams.nextn_predict_layers == 0) {
model->hparams.n_layer_nextn == 0) {
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
return nullptr;
}
Expand Down
2 changes: 1 addition & 1 deletion src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
cparams (params.cparams),
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_layer (hparams.n_layer()),
n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
Expand Down
83 changes: 38 additions & 45 deletions src/llama-hparams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,31 +7,38 @@

void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
}
} else {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}

for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
is_swa_impl[il] = false;
}
}

// TODO: implement
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
// if (dense_first) {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
// }
// } else {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
// }
// }
//}
// Fill is_recr_impl (the per-layer flag returned by is_recr()) using a
// repeating pattern with period n_pattern.
//
// For layer indices il in [0, n_layer()):
//   - n_pattern == 0      : every layer is flagged (the modulo is never evaluated)
//   - dense_first == true : the first layer of each group of n_pattern layers
//                           (il % n_pattern == 0) is NOT flagged, the rest are
//   - dense_first == false: the last layer of each group of n_pattern layers
//                           (il % n_pattern == n_pattern - 1) is NOT flagged, the rest are
//
// Example: n_pattern = 4, dense_first = false -> layers 0,1,2 flagged, layer 3 not, ...
//
// Layer indices in [n_layer(), n_layer_all) are explicitly set to false.
// NOTE(review): "dense" presumably refers to the non-recurrent layers of a hybrid
// model - confirm against the callers.
void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
for (uint32_t il = 0; il < n_layer(); ++il) {
// the `n_pattern == 0 ||` short-circuit also guards against a modulo by zero
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
}
} else {
for (uint32_t il = 0; il < n_layer(); ++il) {
// the `n_pattern == 0 ||` short-circuit also guards against a modulo by zero
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}

// the trailing layers that n_layer() does not count are never flagged
for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
is_recr_impl[il] = false;
}
}

bool llama_hparams::is_swa_any() const {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (is_swa_impl[il]) {
return true;
}
Expand All @@ -41,23 +48,23 @@ bool llama_hparams::is_swa_any() const {
}

uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_head_arr[il];
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_head_kv(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_head_kv_arr[il];
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_ff(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_ff_arr[il];
}

Expand All @@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
}

uint32_t llama_hparams::n_rot(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_rot_swa : n_rot_full;
}

Expand All @@ -98,15 +105,15 @@ uint32_t llama_hparams::n_embd_out() const {
}

uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
}

Expand All @@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {

bool llama_hparams::is_n_embd_k_gqa_variable() const {
const uint32_t val = n_embd_k_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (val != n_embd_k_gqa(il)) {
return true;
}
Expand All @@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {

bool llama_hparams::is_n_embd_v_gqa_variable() const {
const uint32_t val = n_embd_v_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (val != n_embd_v_gqa(il)) {
return true;
}
Expand All @@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {

uint32_t llama_hparams::n_embd_k_gqa_max() const {
uint32_t val = n_embd_k_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
val = std::max(val, n_embd_k_gqa(il));
}

Expand All @@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {

uint32_t llama_hparams::n_embd_v_gqa_max() const {
uint32_t val = n_embd_v_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
val = std::max(val, n_embd_v_gqa(il));
}

Expand Down Expand Up @@ -207,23 +214,23 @@ uint32_t llama_hparams::n_embd_s() const {
}

bool llama_hparams::is_recr(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_recr_impl[il];
}

GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
}

uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}

bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a potential discrepancy with is_swa_any, which only checks n_layer(), although the result is the same, as set_swa_pattern also only updates the first n_layer() entries while the rest are zeroed.

Technically anything above n_layer() is out of bounds, until there is SWA MTP of course. :)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added code to explicitly set the entries for il >= n_layer() to false to make this clearer.

return is_swa_impl[il];
}

GGML_ABORT("fatal error");
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
}

bool llama_hparams::is_mla() const {
Expand All @@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
}

bool llama_hparams::has_kv(uint32_t il) const {
if (kv_only_nextn) {
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
// the leading trunk blocks are not executed in this graph.
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
}

if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;
Expand All @@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const {
return true;
}

uint32_t llama_hparams::n_layer_kv() const {
uint32_t res = 0;

for (uint32_t il = 0; il < n_layer; ++il) {
if (has_kv(il)) {
res++;
}
}

return res;
uint32_t llama_hparams::n_layer() const {
return n_layer_all - n_layer_nextn;
}

bool llama_hparams::use_mrope() const {
Expand Down
17 changes: 8 additions & 9 deletions src/llama-hparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,15 @@ struct llama_hparams {

uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_layer_all;
uint32_t n_layer_nextn = 0;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;

// TODO: this needs to be reworked
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache

// different head size for full_attention and SWA layers
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
Expand Down Expand Up @@ -96,9 +99,6 @@ struct llama_hparams {
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;

bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)

float f_norm_eps;
float f_norm_rms_eps;
Expand Down Expand Up @@ -272,8 +272,7 @@ struct llama_hparams {

bool is_swa(uint32_t il) const;

// TODO: implement
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);

// whether or not the given layer is recurrent (for hybrid models)
bool is_recr(uint32_t il) const;
Expand Down Expand Up @@ -329,8 +328,8 @@ struct llama_hparams {

bool has_kv(uint32_t il) const;

// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
// number of effective layers (excludes nextn layers)
uint32_t n_layer() const;

// note that this function uses different SWA parameters from those in the hparams
// note: inlined on purpose for performance reasons
Expand Down
8 changes: 4 additions & 4 deletions src/llama-kv-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(

GGML_ASSERT(kv_size % n_pad == 0);

const uint32_t n_layer_kv = hparams.n_layer_kv();
const uint32_t n_layer = hparams.n_layer_all;

// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
Expand All @@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
Expand Down Expand Up @@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(

const bool is_mla = hparams.is_mla();

for (uint32_t il = 0; il < hparams.n_layer; il++) {
for (uint32_t il = 0; il < n_layer; il++) {
if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
continue;
Expand Down Expand Up @@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
if (reuse) {
LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

for (uint32_t il = 0; il < hparams.n_layer; il++) {
for (uint32_t il = 0; il < n_layer; il++) {
const int32_t il_reuse = reuse(il);

if (il_reuse < 0) {
Expand Down
Loading
Loading