Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1713,7 +1713,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
// that have no expert_gating_func model parameter set
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
if (hparams.n_layer == 47 || hparams.n_layer == 48) { // GLM 4.7 Lite
Comment thread
pwilkin marked this conversation as resolved.
Outdated
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
} else {
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
}
}

if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/deepseek2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
true, hparams.expert_weights_scale,
Comment thread
pwilkin marked this conversation as resolved.
Outdated
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
Expand Down
Loading