@@ -13793,7 +13793,21 @@ struct llm_build_glm4_moe : public llm_graph_context {
 
         ggml_tensor * cur;
 
-        if (params.mtp_params.op_type != MTP_OP_NONE) {
+        // if (params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION) {
+        //     ggml_tensor* hidden_states_from_main_model;
+
+        //     if (params.mtp_params.op_type == MTP_OP_WARMUP || params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {
+        //         hidden_states_from_main_model = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+        //     } else {
+        //         hidden_states_from_main_model = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_embd);
+        //     }
+        //     ggml_set_name(hidden_states_from_main_model, "result_embd_pooled");
+        //     ggml_set_input(hidden_states_from_main_model);
+
+        //     auto inp_mtp = std::make_unique<llm_graph_input_mtp_states>();
+        //     inp_mtp->states = hidden_states_from_main_model;
+        //     res->add_input(std::move(inp_mtp));
+        if (params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION) {
             ggml_tensor* hidden_states_from_main_model;
 
             if (params.mtp_params.op_type == MTP_OP_WARMUP || params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {
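 
The active branch creates a graph input named "result_embd_pooled" that the caller must fill with the main model's hidden states before the MTP graph is evaluated (the commented-out variant shows the same registration via llm_graph_input_mtp_states). As a rough, non-authoritative sketch of what filling such a named input can look like with the stock ggml-backend API (the helper name and the `embd` buffer are illustrative, not part of this patch):

    #include "ggml.h"
    #include "ggml-backend.h"

    // Hypothetical helper: copy the main model's hidden states into the named
    // MTP input tensor after the graph has been built and its buffers allocated.
    static void set_mtp_hidden_states(ggml_cgraph * gf, const float * embd) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, "result_embd_pooled");
        if (inp != nullptr) {
            // embd must hold ggml_nelements(inp) contiguous floats
            ggml_backend_tensor_set(inp, embd, 0, ggml_nbytes(inp));
        }
    }
 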
@@ -13971,8 +13985,9 @@ struct llm_build_glm4_moe : public llm_graph_context {
         ggml_tensor * embd_copy = ggml_dup(ctx0, prev_embeddings);
 
         const int il = hparams.n_layer - 1;
+        // cb(embd_copy, "mtp_embd_copy", il);
         ggml_tensor * sum_node = ggml_sum(ctx0, embd_copy);
-
+        // cb(sum_node, "mtp_sum_node", il);
         ggml_set_name(sum_node, "mtp_input_sum");
 
         ggml_tensor * inp_pos = build_inp_pos();
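 
The ggml_sum over the copied embeddings appears to exist so that the graph carries a small, named node ("mtp_input_sum") depending on the MTP input, which is handy for debugging. A possible read-back after graph compute, assuming only the standard ggml-backend API (the helper name is illustrative):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <cstdio>

    // Hypothetical debug helper: after the graph has been computed, read back
    // the scalar produced by ggml_sum() to verify that the MTP branch actually
    // received non-zero hidden states.
    static void log_mtp_input_sum(ggml_cgraph * gf) {
        ggml_tensor * sum = ggml_graph_get_tensor(gf, "mtp_input_sum");
        if (sum != nullptr) {
            float v = 0.0f;
            ggml_backend_tensor_get(sum, &v, 0, sizeof(v));
            printf("mtp_input_sum = %f\n", v);
        }
    }
 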
@@ -13983,30 +13998,48 @@ struct llm_build_glm4_moe : public llm_graph_context {
         ggml_tensor * hidden_state_norm = build_norm(embd_copy, mtp_layer.nextn.hnorm, NULL, LLM_NORM_RMS, il);
 
         ggml_tensor * combined = ggml_concat(ctx0, token_emb_norm, hidden_state_norm, 0);
+        // cb(combined, "mtp_combined", il);
+
         ggml_tensor* cur = build_lora_mm(mtp_layer.nextn.eh_proj, combined);
 
         // now proceed through last layer (skipped in main model)
         ggml_tensor * inpSA = cur;
         // Pre-attention norm for the MTP block
         cur = build_norm(cur, mtp_layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "mtp_attn_norm", il);
 
         // self-attention
         {
             ggml_tensor * Qcur = build_lora_mm(mtp_layer.wq, cur);
+            // if (mtp_layer.bq) {
+            //     Qcur = ggml_add(ctx0, Qcur, mtp_layer.bq);
+            //     cb(Qcur, "mtp_q_bias", il); // ADDED
+            // }
             if (mtp_layer.bq) Qcur = ggml_add(ctx0, Qcur, mtp_layer.bq);
             cb(Qcur, "Qcur", il);
 
             ggml_tensor * Kcur = build_lora_mm(mtp_layer.wk, cur);
+            // if (mtp_layer.bk) {
+            //     Kcur = ggml_add(ctx0, Kcur, mtp_layer.bk);
+            //     cb(Kcur, "mtp_k_bias", il); // ADDED
+            // }
             if (mtp_layer.bk) Kcur = ggml_add(ctx0, Kcur, mtp_layer.bk);
             cb(Kcur, "Kcur", il);
 
             ggml_tensor * Vcur = build_lora_mm(mtp_layer.wv, cur);
+            // if (mtp_layer.bv) {
+            //     Vcur = ggml_add(ctx0, Vcur, mtp_layer.bv);
+            //     cb(Vcur, "mtp_v_bias", il); // ADDED
+            // }
             if (mtp_layer.bv) Vcur = ggml_add(ctx0, Vcur, mtp_layer.bv);
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            // cb(Qcur, "mtp_q_reshaped", il);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            // cb(Kcur, "mtp_k_reshaped", il);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            // cb(Vcur, "mtp_v_reshaped", il);
 
             // Apply Q/K norm if available (GLM-4.5 355B variant)
             if (mtp_layer.attn_q_norm) {
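 
The reshapes above view each [n_embd_head * n_head, n_tokens] projection as [n_embd_head, n_head, n_tokens] (ggml orders dimensions fastest-first) so that RoPE and attention operate per head. A standalone sketch of that view, with made-up sizes and assuming only the public ggml API:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // small scratch context; sizes below are illustrative only
        ggml_init_params ip = { /*mem_size =*/ 16u*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        ggml_context * ctx = ggml_init(ip);

        const int n_embd_head = 64, n_head = 8, n_tokens = 4;
        ggml_tensor * q2d = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_head*n_head, n_tokens);
        ggml_tensor * q3d = ggml_reshape_3d(ctx, q2d, n_embd_head, n_head, n_tokens);

        // prints: 64 x 8 x 4
        printf("%lld x %lld x %lld\n", (long long) q3d->ne[0], (long long) q3d->ne[1], (long long) q3d->ne[2]);

        ggml_free(ctx);
        return 0;
    }
 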
@@ -14023,12 +14056,14 @@ struct llm_build_glm4_moe : public llm_graph_context {
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
+            // cb(Qcur, "mtp_q_rope", il);
 
             Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
+            // cb(Kcur, "mtp_k_rope", il);
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -14040,8 +14075,10 @@ struct llm_build_glm4_moe : public llm_graph_context {
         }
 
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        // cb(ffn_inp, "mtp_ffn_inp", il);
 
         cur = build_norm(ffn_inp, mtp_layer.attn_post_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "post_attn_norm", il);
 
         // moe ffn for nextn block
         {
@@ -14073,7 +14110,10 @@ struct llm_build_glm4_moe : public llm_graph_context {
             cb(cur, "ffn_out", il);
         }
         cur = ggml_add(ctx0, cur, ffn_inp);
+        // cb(cur, "mtp_ffn_residual", il);
+
         cur = build_norm(cur, mtp_layer.nextn.shared_head_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "mtp_final_norm", il);
         cur = build_lora_mm(mtp_layer.nextn.shared_head_head, cur);
 
         return cur;
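 
shared_head_norm followed by shared_head_head projects the MTP hidden state to vocabulary logits, from which a draft token is then chosen. A hedged, self-contained sketch of a greedy pick over such logits once they are on the host (function and buffer names are illustrative; this is not llama.cpp's actual sampling path):

    #include <cstddef>

    // Greedy draft-token selection over a host-side copy of the MTP logits.
    static int argmax_token(const float * logits, size_t n_vocab) {
        size_t best = 0;
        for (size_t i = 1; i < n_vocab; ++i) {
            if (logits[i] > logits[best]) {
                best = i;
            }
        }
        return (int) best;
    }
 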
@@ -18305,7 +18345,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 }
 
 ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
-
+    const int64_t t_start_us = ggml_time_us();
     std::unique_ptr<llm_graph_context> llm;
     switch (arch) {
         case LLM_ARCH_LLAMA:
@@ -18664,10 +18704,16 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             GGML_ABORT("fatal error");
     }
 
-    if (params.mtp_params.op_type == MTP_OP_NONE) {
+    if (params.mtp_params.op_type == MTP_OP_NONE || params.mtp_params.op_type == MTP_OP_MAIN_VALIDATION) {
         // add on pooling layer
         llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
     }
+    const int64_t t_end_us = ggml_time_us();
+    LLAMA_LOG_INFO(
+        "[PERF] Graph build time: %.2f ms (MTP path: %s)\n",
+        (t_end_us - t_start_us) / 1000.0,
+        (params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION) ? "yes" : "no"
+    );
     return llm->res->get_gf();
 }
 
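 
The timing added above works, but the start/end bookkeeping must be repeated in every function one wants to measure. A possible RAII alternative, as a sketch only (scoped_graph_timer is a hypothetical helper, not part of llama.cpp; LLAMA_LOG_INFO is llama.cpp's internal logging macro):

    #include "ggml.h"    // ggml_time_us()
    #include <cstdint>

    // Logs the elapsed wall-clock time for the enclosing scope on destruction.
    struct scoped_graph_timer {
        const char * label;
        int64_t      t0;
        explicit scoped_graph_timer(const char * l) : label(l), t0(ggml_time_us()) {}
        ~scoped_graph_timer() {
            LLAMA_LOG_INFO("[PERF] %s: %.2f ms\n", label, (ggml_time_us() - t0) / 1000.0);
        }
    };

    // usage at the top of llama_model::build_graph():
    //     scoped_graph_timer timer("Graph build time");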