Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pi/gg/SYSTEM.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
- Ask the user to tell you what model was used and write it in place of [MODEL]
- Always create the pull requests in draft mode

Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
Expand Down
8 changes: 4 additions & 4 deletions src/llama-adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
/*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
Expand All @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
};

// make tensors
tensors.reserve(hparams.n_layer);
tensors.reserve(hparams.n_layer());
tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
for (size_t il = 1; il < hparams.n_layer(); il++) {
ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
Expand Down Expand Up @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
layer_start = il_start;
layer_end = il_end;

for (size_t il = 1; il < hparams.n_layer; il++) {
for (size_t il = 1; il < hparams.n_layer(); il++) {
assert(tensors[il] != nullptr);

const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
Expand Down
10 changes: 5 additions & 5 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer &&
model.n_gpu_layers() > model.hparams.n_layer() &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
Expand Down Expand Up @@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
Expand Down Expand Up @@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(

if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
Expand All @@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(

if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
Expand All @@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
}

if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
model->hparams.nextn_predict_layers == 0) {
model->hparams.n_layer_nextn == 0) {
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
return nullptr;
}
Expand Down
2 changes: 1 addition & 1 deletion src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
cparams (params.cparams),
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_layer (hparams.n_layer()),
n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
Expand Down
83 changes: 38 additions & 45 deletions src/llama-hparams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,31 +7,38 @@

void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
}
} else {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}

for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
is_swa_impl[il] = false;
}
}

// TODO: implement
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
// if (dense_first) {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
// }
// } else {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
// }
// }
//}
void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
}
} else {
for (uint32_t il = 0; il < n_layer(); ++il) {
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}

for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
is_recr_impl[il] = false;
}
}

bool llama_hparams::is_swa_any() const {
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (is_swa_impl[il]) {
return true;
}
Expand All @@ -41,23 +48,23 @@ bool llama_hparams::is_swa_any() const {
}

uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_head_arr[il];
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_head_kv(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_head_kv_arr[il];
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_ff(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return n_ff_arr[il];
}

Expand All @@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
}

uint32_t llama_hparams::n_rot(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_rot_swa : n_rot_full;
}

Expand All @@ -98,15 +105,15 @@ uint32_t llama_hparams::n_embd_out() const {
}

uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
}

GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
}

Expand All @@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {

bool llama_hparams::is_n_embd_k_gqa_variable() const {
const uint32_t val = n_embd_k_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (val != n_embd_k_gqa(il)) {
return true;
}
Expand All @@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {

bool llama_hparams::is_n_embd_v_gqa_variable() const {
const uint32_t val = n_embd_v_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
if (val != n_embd_v_gqa(il)) {
return true;
}
Expand All @@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {

uint32_t llama_hparams::n_embd_k_gqa_max() const {
uint32_t val = n_embd_k_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
val = std::max(val, n_embd_k_gqa(il));
}

Expand All @@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {

uint32_t llama_hparams::n_embd_v_gqa_max() const {
uint32_t val = n_embd_v_gqa();
for (uint32_t il = 0; il < n_layer; ++il) {
for (uint32_t il = 0; il < n_layer_all; ++il) {
val = std::max(val, n_embd_v_gqa(il));
}

Expand Down Expand Up @@ -207,23 +214,23 @@ uint32_t llama_hparams::n_embd_s() const {
}

bool llama_hparams::is_recr(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {
return is_recr_impl[il];
}

GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
}

uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}

bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
if (il < n_layer_all) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a potential discrepancy with is_swa_any, which only checks up to n_layer(), although the result is the same, since set_swa_pattern also only updates the first n_layer() entries while the rest are zeroed.

Technically anything above n_layer() is out of bounds, until there is SWA MTP of course. :)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added code to explicitly set the entries for il >= n_layer() to false, to make this clearer.

return is_swa_impl[il];
}

GGML_ABORT("fatal error");
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
}

bool llama_hparams::is_mla() const {
Expand All @@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
}

bool llama_hparams::has_kv(uint32_t il) const {
if (kv_only_nextn) {
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
// the leading trunk blocks are not executed in this graph.
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
}

if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;
Expand All @@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const {
return true;
}

uint32_t llama_hparams::n_layer_kv() const {
uint32_t res = 0;

for (uint32_t il = 0; il < n_layer; ++il) {
if (has_kv(il)) {
res++;
}
}

return res;
uint32_t llama_hparams::n_layer() const {
return n_layer_all - n_layer_nextn;
}

bool llama_hparams::use_mrope() const {
Expand Down
17 changes: 8 additions & 9 deletions src/llama-hparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,15 @@ struct llama_hparams {

uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_layer_all;
uint32_t n_layer_nextn = 0;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;

// TODO: this needs to be reworked
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache

// different head size for full_attention and SWA layers
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
Expand Down Expand Up @@ -96,9 +99,6 @@ struct llama_hparams {
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;

bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)

float f_norm_eps;
float f_norm_rms_eps;
Expand Down Expand Up @@ -272,8 +272,7 @@ struct llama_hparams {

bool is_swa(uint32_t il) const;

// TODO: implement
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);

// whether or not the given layer is recurrent (for hybrid models)
bool is_recr(uint32_t il) const;
Expand Down Expand Up @@ -329,8 +328,8 @@ struct llama_hparams {

bool has_kv(uint32_t il) const;

// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
// number of effective layers (excludes nextn layers)
uint32_t n_layer() const;

// note that this function uses different SWA parameters from those in the hparams
// note: inlined on purpose for performance reasons
Expand Down
8 changes: 4 additions & 4 deletions src/llama-kv-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(

GGML_ASSERT(kv_size % n_pad == 0);

const uint32_t n_layer_kv = hparams.n_layer_kv();
const uint32_t n_layer = hparams.n_layer_all;

// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
Expand All @@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
Expand Down Expand Up @@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(

const bool is_mla = hparams.is_mla();

for (uint32_t il = 0; il < hparams.n_layer; il++) {
for (uint32_t il = 0; il < n_layer; il++) {
if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
continue;
Expand Down Expand Up @@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
if (reuse) {
LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

for (uint32_t il = 0; il < hparams.n_layer; il++) {
for (uint32_t il = 0; il < n_layer; il++) {
const int32_t il_reuse = reuse(il);

if (il_reuse < 0) {
Expand Down
Loading