Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
{ LLM_KV_ATTENTION_RECURRENT_LAYERS, "%s.attention.recurrent_layers" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
Expand Down
1 change: 1 addition & 0 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ enum llm_kv {
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
LLM_KV_ATTENTION_RECURRENT_LAYERS,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
Expand Down
25 changes: 19 additions & 6 deletions src/llama-hparams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,31 @@
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
for (uint32_t il = 0; il < n_layer; ++il) {
swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
}
} else {
for (uint32_t il = 0; il < n_layer; ++il) {
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}
}

// TODO: implement
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
// if (dense_first) {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
// }
// } else {
// for (uint32_t il = 0; il < n_layer; ++il) {
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
// }
// }
//}

bool llama_hparams::is_swa_any() const {
for (uint32_t il = 0; il < n_layer; ++il) {
if (swa_layers[il]) {
if (is_swa_impl[il]) {
return true;
}
}
Expand Down Expand Up @@ -193,9 +206,9 @@ uint32_t llama_hparams::n_embd_s() const {
return ssm_d_state * ssm_d_inner;
}

bool llama_hparams::is_recurrent(uint32_t il) const {
bool llama_hparams::is_recr(uint32_t il) const {
if (il < n_layer) {
return recurrent_layer_arr[il];
return is_recr_impl[il];
}

GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
Expand All @@ -207,7 +220,7 @@ uint32_t llama_hparams::n_pos_per_embd() const {

bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
return swa_layers[il];
return is_swa_impl[il];
}

GGML_ABORT("fatal error");
Expand Down
31 changes: 19 additions & 12 deletions src/llama-hparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ struct llama_hparams_convnext {
};

struct llama_hparams {
// note: use the `_impl` suffix to avoid name conflict between members and getters
// for example: n_embd_out() vs n_embd_out_impl

bool vocab_only;
bool no_alloc;
bool rope_finetuned;
Expand All @@ -46,7 +49,7 @@ struct llama_hparams {
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
Expand Down Expand Up @@ -137,11 +140,15 @@ struct llama_hparams {
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
// if swa_layers[il] == 1, then layer il is SWA
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)

// if is_swa_impl[il] == 1, then layer il is SWA
// if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
// note: using uint32_t type for compatibility reason
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
std::array<uint32_t, LLAMA_MAX_LAYERS> is_swa_impl;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure "is_swa_impl" is a good choice for the variable name. I'm reading it as "is SWA implementation" but then you have the code of the individual models manipulating it which to me would intuitively seem like the models messing with the internals of llama_hparams. Maybe "swa_pattern" to be consistent with set_swa_pattern?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get the is_swa part, to match the function, but agreed, it's confusing, maybe is_swa_layer?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll follow up with more refactoring of the hparams after this to avoid this PR growing. The main goal here is to get recurrent models enrolled in test-llama-archs to be able to generate small dummy models for testing purposes.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Be aware though that currently there is no implementation for creating a dummy vocab for those models - I have a poor understanding of the related code and did not want to delay the unit tests for TP. But this means that you cannot just use the dummy models for e.g. llama-perplexity or llama-completion.

Copy link
Copy Markdown
Member Author

@ggerganov ggerganov Jun 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I noticed that. I'll be using it with test-save-load-state and I can rework it to not require a vocab.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For dummy models, wouldn't it be fine to just map ASCII characters to int? I would intuitively assume that that would not be too difficult to implement, the problem for me was just that I would have to read up on the vocab code first.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, probably. It would definitely be useful to generate some dummy vocabs too. Will take a look.


// for hybrid state space models
std::array<uint32_t, LLAMA_MAX_LAYERS> is_recr_impl;

// for State Space Models
uint32_t ssm_d_conv = 0;
Expand All @@ -153,9 +160,6 @@ struct llama_hparams {
// for Kimi Linear KDA
uint32_t n_embd_head_kda = 0;

// for hybrid state space models
std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;

bool ssm_dt_b_c_rms = false;

float f_clamp_kqv = 0.0f;
Expand Down Expand Up @@ -266,6 +270,14 @@ struct llama_hparams {
// return true if one of the layers is SWA
bool is_swa_any() const;

bool is_swa(uint32_t il) const;

// TODO: implement
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);

// whether or not the given layer is recurrent (for hybrid models)
bool is_recr(uint32_t il) const;

uint32_t n_head(uint32_t il = 0) const;

uint32_t n_head_kv(uint32_t il = 0) const;
Expand Down Expand Up @@ -307,13 +319,8 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_s() const;

// whether or not the given layer is recurrent (for hybrid models)
bool is_recurrent(uint32_t il) const;

uint32_t n_pos_per_embd() const;

bool is_swa(uint32_t il) const;

// note: currently only support if either all or none of the layers are MLA
bool is_mla() const;

Expand Down
4 changes: 2 additions & 2 deletions src/llama-memory-hybrid-iswa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
n_ubatch,
n_pad,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
[&](int32_t il) { return !hparams.is_recr(il); }
: filter_attn,
nullptr
)),
Expand All @@ -57,7 +57,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
n_seq_max,
n_rs_seq,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
[&](int32_t il) { return hparams.is_recr(il); }
: filter_recr
)) {}

Expand Down
4 changes: 2 additions & 2 deletions src/llama-memory-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ llama_memory_hybrid::llama_memory_hybrid(
n_swa,
swa_type,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
[&](int32_t il) { return !hparams.is_recr(il); }
: filter_attn,
nullptr
)),
Expand All @@ -58,7 +58,7 @@ llama_memory_hybrid::llama_memory_hybrid(
n_seq_max,
n_rs_seq,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
[&](int32_t il) { return hparams.is_recr(il); }
: filter_recr
)) {}

Expand Down
8 changes: 4 additions & 4 deletions src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ namespace GGUFMeta {
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
return ArrayInfo {
arr_type,
size_t(gguf_get_arr_n(ctx, k)),
gguf_get_arr_n(ctx, k),
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
};
}
Expand Down Expand Up @@ -445,7 +445,7 @@ namespace GGUFMeta {
}

if (n > N_MAX) {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str()));
}

if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
Expand Down Expand Up @@ -502,9 +502,9 @@ namespace GGUFMeta {
}

// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>> (enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);


llama_model_loader::llama_model_loader(
Expand Down
8 changes: 4 additions & 4 deletions src/llama-model-saver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@

bool llama_model_saver_supports_arch(llm_arch arch) {
switch (arch) {
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_QWEN35:
case LLM_ARCH_QWEN35MOE:
case LLM_ARCH_PLAMO3:
case LLM_ARCH_GEMMA3:
case LLM_ARCH_GEMMA3N:
Expand Down Expand Up @@ -107,6 +104,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
} else if (std::is_same<typename Container::value_type, uint32_t>::value) {
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
} else if (std::is_same<typename Container::value_type, bool>::value) {
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_BOOL, value.data(), n_values);
} else if (std::is_same<typename Container::value_type, int32_t>::value) {
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
} else if (std::is_same<typename Container::value_type, float>::value) {
Expand Down Expand Up @@ -245,7 +244,7 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
Comment thread
JohannesGaessler marked this conversation as resolved.
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); // saved as LLM_KV_ATTENTION_RECURRENT_LAYERS instead

add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
Expand Down Expand Up @@ -279,6 +278,7 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
add_kv(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, true);

const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

Expand Down
28 changes: 13 additions & 15 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,10 +373,10 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
// count only the same type of previous layers to avoid this
auto get_il_eff = [&](const size_t il){
size_t ret = 0;
const bool il_is_recurrent = hparams.is_recurrent(il);
const bool il_is_swa = hparams.is_swa(il);
const bool il_is_recr = hparams.is_recr(il);
const bool il_is_swa = hparams.is_swa(il);
for (size_t il_prev = 0; il_prev < il; il_prev++) {
ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa;
ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa;
}
return ret;
};
Expand Down Expand Up @@ -553,7 +553,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
};

auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<std::pair<int64_t, uint32_t>> & segments) -> std::vector<int64_t> {
if (hparams.is_recurrent(il)) {
if (hparams.is_recr(il)) {
// linear attention
const int64_t head_dim = hparams.ssm_d_state;
const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
Expand Down Expand Up @@ -1076,18 +1076,16 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
std::fill(
hparams.recurrent_layer_arr.begin(),
hparams.recurrent_layer_arr.end(),
llm_arch_is_recurrent(ml.get_arch()));

std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
std::fill(hparams.is_swa_impl.begin(), hparams.is_swa_impl.end(), 0);
std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0);

std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);

std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);

Expand Down Expand Up @@ -2040,18 +2038,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
filter_recr = [&](int32_t) { return true; };
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
filter_attn = [&](int32_t il) {
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
};
filter_recr = [&](int32_t il) {
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
return hparams.is_recr(il) && hparams.n_ff(il) == 0;
};
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
filter_attn = [&, n_main](int32_t il) {
return (uint32_t)il < n_main && !hparams.is_recurrent(il);
return (uint32_t)il < n_main && !hparams.is_recr(il);
};
filter_recr = [&, n_main](int32_t il) {
return (uint32_t)il < n_main && hparams.is_recurrent(il);
return (uint32_t)il < n_main && hparams.is_recr(il);
};
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/falcon-h1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);

std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);

switch (hparams.n_layer) {
case 36:
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);

uint32_t n_kv_shared_layers = 0;
ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
Expand Down
6 changes: 3 additions & 3 deletions src/models/granite-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {

// A layer is recurrent IFF the n_head_kv value is set to 0
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
}

ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
Expand Down Expand Up @@ -71,7 +71,7 @@ void llama_model_granite_hybrid::load_arch_tensors(llama_model_loader &) {
// norm
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

if (hparams.is_recurrent(i)) {
if (hparams.is_recr(i)) {
// ssm layers
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);

Expand Down Expand Up @@ -158,7 +158,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);

if (hparams.is_recurrent(il)) {
if (hparams.is_recr(il)) {
// ssm layer //
cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/models/jamba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
}

switch (hparams.n_layer) {
Expand Down
Loading
Loading