Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/llama-model-saver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() {
add_tensor(model->output);
add_tensor(model->output_b);
add_tensor(model->output_norm_enc);
add_tensor(model->output_s);
add_tensor(model->output_in_s);
add_tensor(model->cls);
add_tensor(model->cls_b);
add_tensor(model->cls_out);
Expand Down
15 changes: 14 additions & 1 deletion src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1394,10 +1394,23 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
}
}
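// optional single-element scales for the output (LM head) tensor:
// only looked up when an output tensor exists, and only if not already created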
if (output) {
// weight scale
if (!output_s) {
output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED);
}
// input scale
if (!output_in_s) {
output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED);
}
}
}

ml.done_getting_tensors();

GGML_ASSERT(!(output && tok_embd &&
strcmp(output->name, tok_embd->name) == 0 &&
output->type == GGML_TYPE_NVFP4));
// populate tensors_by_name
for (auto & [_, ctx_ptr] : ml.ctx_map) {
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
Expand Down
5 changes: 5 additions & 0 deletions src/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,11 @@ struct llama_model {
struct ggml_tensor * output_b = nullptr;
struct ggml_tensor * output_norm_enc = nullptr;


// NVFP4 per-tensor scales for the LM head (output) tensor:
// output_s is the weight scale (scale2), output_in_s is the input scale
struct ggml_tensor * output_s = nullptr;
struct ggml_tensor * output_in_s = nullptr;

// classifier
struct ggml_tensor * cls = nullptr;
struct ggml_tensor * cls_b = nullptr;
Expand Down
2 changes: 1 addition & 1 deletion src/models/afmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

Expand Down
2 changes: 1 addition & 1 deletion src/models/apertus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arcee.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arctic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arwkv7.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/baichuan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bloom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/chameleon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output_with_img_logits", -1);

// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
Expand Down
2 changes: 1 addition & 1 deletion src/models/chatglm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/codeshell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/cogvlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
Expand Down
2 changes: 1 addition & 1 deletion src/models/cohere2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
Expand Down
2 changes: 1 addition & 1 deletion src/models/command-r.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
Expand Down
2 changes: 1 addition & 1 deletion src/models/dbrx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/deci.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/deepseek.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/dots1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/dream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/ernie4-5-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/ernie4-5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/falcon-h1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/falcon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

// final logit soft-capping
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ llama_model_gemma3::graph<iswa>::graph(const llama_model & model, const llm_grap
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma3n.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

{
// final logit soft-capping
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/glm4-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/glm4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// Output projection
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gpt2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gptneox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/granite-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

// For Granite architectures - scale logits
if (hparams.f_logit_scale) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/granite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ llama_model_granite::graph::graph(
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

// For Granite architectures - scale logits
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
Expand Down
2 changes: 1 addition & 1 deletion src/models/grok.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);

Expand Down
Loading
Loading