Commit 5859cb9

mtp-graph (wip): testing different ways to allow graph reuse
1 parent 15dff20 commit 5859cb9

6 files changed: +117 -41 lines changed

common/speculative.cpp

Lines changed: 6 additions & 0 deletions

@@ -382,7 +382,10 @@ llama_token mtp_speculative_gen_draft(
 
     // Perform the MTP draft generation decode. This writes the MTP layer's
    // KV state for the draft token into the cache.
+    const int64_t t_start_us = ggml_time_us();
     llama_decode(ctx, mtp_batch);
+    const int64_t t_end_us = ggml_time_us();
+    LOG_INF("[PERF-MTP] mtp_speculative_gen_draft internal decode: %.2f ms\n", (t_end_us - t_start_us) / 1000.0);
     llama_batch_free(mtp_batch);
 
     // CRITICAL: Purge the metadata for the draft token we just wrote.

@@ -423,7 +426,10 @@ void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch& batch, b
     for (int i = 0; i < mtp_batch.n_tokens; ++i) {
         mtp_batch.logits[i] = true;
     }
+    const int64_t t_start_us = ggml_time_us();
     llama_decode(ctx, mtp_batch);
+    const int64_t t_end_us = ggml_time_us();
+    LOG_INF("[PERF-MTP] mtp_update_kv_cache internal decode (op=%d): %.2f ms\n", (int)mtp_batch.mtp_params.op_type, (t_end_us - t_start_us) / 1000.0);
 }
 
 void mtp_accept_tokens(
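
Both hunks bracket llama_decode with the same manual start/stop pattern. A minimal sketch, not part of this commit, of an RAII helper built on the same ggml_time_us()/LOG_INF API that would collapse each pair to one declaration (the perf_scope_timer name is hypothetical):

    #include <cstdint>

    // Hypothetical helper: logs the elapsed time for a scope on destruction.
    struct perf_scope_timer {
        const char *  label;
        const int64_t t_start_us;

        explicit perf_scope_timer(const char * label)
            : label(label), t_start_us(ggml_time_us()) {}

        ~perf_scope_timer() {
            const int64_t t_end_us = ggml_time_us();
            LOG_INF("[PERF-MTP] %s: %.2f ms\n", label, (t_end_us - t_start_us) / 1000.0);
        }
    };

    // Usage inside mtp_speculative_gen_draft:
    // {
    //     perf_scope_timer timer("mtp_speculative_gen_draft internal decode");
    //     llama_decode(ctx, mtp_batch);
    // }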

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -226,6 +226,7 @@ extern "C" {
         MTP_OP_WARMUP,
         MTP_OP_UPDATE_ACCEPTED,
         MTP_OP_DRAFT_GEN,
+        MTP_OP_MAIN_VALIDATION,
     } llama_mtp_op_type;
 
     typedef struct llama_mtp_params {
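
MTP_OP_MAIN_VALIDATION marks a main-model decode that validates previously drafted tokens; the hunks below group it with MTP_OP_NONE for pooling, embedding extraction, and sinfo capture, while excluding it from the MTP-specific input preparation. A sketch of the intended call site, assuming the fork's llama_batch exposes mtp_params as the other files in this commit do (the batch setup itself is illustrative):

    // Sketch: tag the validation decode so it takes the main-model path.
    llama_batch batch = llama_batch_get_one(tokens, n_tokens);
    batch.mtp_params.op_type = MTP_OP_MAIN_VALIDATION;
    llama_decode(ctx, batch);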

src/llama-context.cpp

Lines changed: 26 additions & 3 deletions

@@ -35,6 +35,7 @@ struct llama_context_kv_cache_data {
     llama_kv_cache_unified::slot_info_vec_t resized_sinfo_for_force;
     const llama_kv_cache_unified::slot_info_vec_t * forced_sinfos = nullptr;
     std::map<llama_graph_cache_key, llm_graph_result_ptr> graph_cache;
+    llm_graph_result_ptr gf_res_prev_validation;
 };
 
 llama_context::llama_context(

@@ -788,24 +789,35 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
            LLAMA_LOG_INFO("[GRAPH-CACHE] MISS, RECONSTRUCTING THE STRUCTURE of the graph for key (op=%d, tok=%d, out=%d)\n",
                (int)key.op_type, key.n_tokens, key.n_outputs);
 
+            const int64_t t_reset_start_us = ggml_time_us();
            ggml_backend_sched_reset(sched.get());
+            const int64_t t_reset_end_us = ggml_time_us();
            ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
            res->reset();
            res->set_params(gparams);
+            const int64_t t_build_start_us = ggml_time_us();
            res->gf = model.build_graph(gparams);
+            const int64_t t_build_end_us = ggml_time_us();
+            LLAMA_LOG_INFO("[PERF-GRAPH] Graph build (op=%d): %.2f ms\n", (int)mtp_params.op_type, (t_build_end_us - t_build_start_us) / 1000.0);
 
            if (!res->gf) {
                LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
                ret = GGML_STATUS_FAILED;
                return nullptr;
            }
 
+            const int64_t t_alloc_start_us = ggml_time_us();
            if (!ggml_backend_sched_alloc_graph(sched.get(), res->gf)) {
                LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
                ret = GGML_STATUS_ALLOC_FAILED;
                return nullptr;
            }
+            const int64_t t_alloc_end_us = ggml_time_us();
+            LLAMA_LOG_INFO("[PERF-GRAPH] sched_reset: %.2f ms | sched_alloc: %.2f ms (op=%d)\n",
+                (t_reset_end_us - t_reset_start_us) / 1000.0,
+                (t_alloc_end_us - t_alloc_start_us) / 1000.0,
+                (int)mtp_params.op_type);
        // }
 
    } else {

@@ -818,14 +830,19 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
    } else {
        LLAMA_LOG_INFO("%s: RECONSTRUCTED graph...\n", __func__);
 
+        const int64_t t_reset_start_us = ggml_time_us();
        ggml_backend_sched_reset(sched.get());
+        const int64_t t_reset_end_us = ggml_time_us();
        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
        res->reset();
        res->set_params(gparams);
        //const auto t_start_us = ggml_time_us();
 
+        const int64_t t_build_start_us = ggml_time_us();
        res->gf = model.build_graph(gparams);
+        const int64_t t_build_end_us = ggml_time_us();
+        LLAMA_LOG_INFO("[PERF-GRAPH] Graph build (op=%d): %.2f ms\n", (int)mtp_params.op_type, (t_build_end_us - t_build_start_us) / 1000.0);
 
        //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
 

@@ -835,15 +852,21 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
            return nullptr;
        }
 
+        const int64_t t_alloc_start_us = ggml_time_us();
        if (!ggml_backend_sched_alloc_graph(sched.get(), res->gf)) {
            LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
            ret = GGML_STATUS_ALLOC_FAILED;
            return nullptr;
        }
+        const int64_t t_alloc_end_us = ggml_time_us();
+        LLAMA_LOG_INFO("[PERF-GRAPH] sched_reset: %.2f ms | sched_alloc: %.2f ms (op=%d)\n",
+            (t_reset_end_us - t_reset_start_us) / 1000.0,
+            (t_alloc_end_us - t_alloc_start_us) / 1000.0,
+            (int)mtp_params.op_type);
        }
    }
 
-    if (mtp_params.op_type != MTP_OP_NONE) { // If it is any MTP operation
+    if (mtp_params.op_type != MTP_OP_NONE && mtp_params.op_type != MTP_OP_MAIN_VALIDATION) {
        if (!prepare_mtp_graph_inputs(res, ubatch, mtp_params)) {
            ret = GGML_STATUS_FAILED;
            return nullptr;

@@ -1241,7 +1264,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
    // extract embeddings
    if (t_embd && n_outputs > 0) {
-        if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
+        if (batch_inp.mtp_params.op_type == MTP_OP_NONE || batch_inp.mtp_params.op_type == MTP_OP_MAIN_VALIDATION) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
            GGML_ASSERT(backend_embd != nullptr);
 

@@ -3133,7 +3156,7 @@ std::unique_ptr<llama_memory_context_i> llama_context::initialize_decode_context
    } else {
        mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
 
-        if (batch_inp.mtp_params.op_type == MTP_OP_NONE) {
+        if (batch_inp.mtp_params.op_type == MTP_OP_NONE || batch_inp.mtp_params.op_type == MTP_OP_MAIN_VALIDATION) {
            if (mctx && mctx->get_status() == LLAMA_MEMORY_STATUS_SUCCESS) {
                kvd->last_main_model_sinfos = static_cast<llama_kv_cache_unified_context *>(mctx.get())->get_sinfos();
            } else {
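
The cache-miss log prints its key as (op=%d, tok=%d, out=%d), which implies the shape of llama_graph_cache_key. A sketch of that key and the lookup pattern, assuming a struct with operator< since graph_cache is a std::map; the real definition lives elsewhere in the fork and may differ:

    #include <cstdint>
    #include <tuple>

    // Assumed key shape; fields inferred from the [GRAPH-CACHE] log above.
    struct llama_graph_cache_key {
        llama_mtp_op_type op_type;
        int32_t           n_tokens;
        int32_t           n_outputs;

        bool operator<(const llama_graph_cache_key & other) const {
            return std::tie(op_type, n_tokens, n_outputs)
                 < std::tie(other.op_type, other.n_tokens, other.n_outputs);
        }
    };

    // Lookup pattern against the std::map member declared above:
    // auto it = kvd->graph_cache.find(key);
    // if (it == kvd->graph_cache.end()) {
    //     // MISS: sched_reset -> build_graph -> sched_alloc, then insert.
    // } else {
    //     // HIT: reuse the cached llm_graph_result_ptr.
    // }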

src/llama-graph.cpp

Lines changed: 10 additions & 22 deletions

@@ -442,34 +442,22 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
 
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
    if (!this->params.allow_reuse(params)) {
-        if (debug > 1) {
-            LLAMA_LOG_DEBUG("%s: cannot reuse graph due to incompatible graph parameters\n", __func__);
-        }
-
+        LLAMA_LOG_WARN("[GRAPH-REUSE-FAIL] Failure in 'allow_reuse': incompatible parameters.\n");
+        LLAMA_LOG_WARN("  n_tokens: %d vs %d, op_type: %d vs %d\n",
+            this->params.ubatch.n_tokens, params.ubatch.n_tokens,
+            (int)this->params.mtp_params.op_type, (int)params.mtp_params.op_type);
        return false;
    }
 
-    if (debug > 1) {
-        LLAMA_LOG_DEBUG("%s: checking compatibility of %d inputs:\n", __func__, (int) inputs.size());
-    }
-
-    bool res = true;
-
-    for (auto & input : inputs) {
-        const bool cur = input->can_reuse(params);
-
-        if (debug > 1) {
-            LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur);
+    for (size_t i = 0; i < inputs.size(); ++i) {
+        if (!inputs[i]->can_reuse(params)) {
+            LLAMA_LOG_WARN("[GRAPH-REUSE-FAIL] Failure in 'can_reuse' of input node #%zu.\n", i);
+            return false;
        }
-
-        res = res && cur;
    }
 
-    if (debug > 0) {
-        LLAMA_LOG_DEBUG("%s: can reuse graph = %d\n", __func__, res);
-    }
-
-    return res;
+    LLAMA_LOG_DEBUG("%s: can reuse graph = true\n", __func__);
+    return true;
 }
 
 llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
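
The rewrite turns can_reuse from an accumulate-everything loop into a fail-fast scan: it returns at the first incompatible input and logs that input's index, at the cost of no longer reporting any later failures in the same pass. A minimal standalone illustration of the new control flow (generic names, not the llama.cpp types):

    #include <cstdio>
    #include <vector>

    // Fail-fast scan: stops at the first failing check and names its index,
    // mirroring the [GRAPH-REUSE-FAIL] diagnostics above.
    static bool can_reuse_all(const std::vector<bool> & checks) {
        for (size_t i = 0; i < checks.size(); ++i) {
            if (!checks[i]) {
                std::printf("[GRAPH-REUSE-FAIL] input node #%zu\n", i);
                return false; // later checks are never evaluated
            }
        }
        return true;
    }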

src/llama-model.cpp

Lines changed: 50 additions & 4 deletions

@@ -13793,7 +13793,21 @@ struct llm_build_glm4_moe : public llm_graph_context {
 
        ggml_tensor * cur;
 
-        if (params.mtp_params.op_type != MTP_OP_NONE) {
+        // if (params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION) {
+        //     ggml_tensor* hidden_states_from_main_model;
+
+        //     if (params.mtp_params.op_type == MTP_OP_WARMUP || params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {
+        //         hidden_states_from_main_model = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+        //     } else {
+        //         hidden_states_from_main_model = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_embd);
+        //     }
+        //     ggml_set_name(hidden_states_from_main_model, "result_embd_pooled");
+        //     ggml_set_input(hidden_states_from_main_model);
+
+        //     auto inp_mtp = std::make_unique<llm_graph_input_mtp_states>();
+        //     inp_mtp->states = hidden_states_from_main_model;
+        //     res->add_input(std::move(inp_mtp));
+        if (params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION) {
            ggml_tensor* hidden_states_from_main_model;
 
            if (params.mtp_params.op_type == MTP_OP_WARMUP || params.mtp_params.op_type == MTP_OP_UPDATE_ACCEPTED) {

@@ -13971,8 +13985,9 @@ struct llm_build_glm4_moe : public llm_graph_context {
        ggml_tensor * embd_copy = ggml_dup(ctx0, prev_embeddings);
 
        const int il = hparams.n_layer - 1;
+        // cb(embd_copy, "mtp_embd_copy", il);
        ggml_tensor * sum_node = ggml_sum(ctx0, embd_copy);
-
+        // cb(sum_node, "mtp_sum_node", il);
        ggml_set_name(sum_node, "mtp_input_sum");
 
        ggml_tensor * inp_pos = build_inp_pos();

@@ -13983,30 +13998,48 @@ struct llm_build_glm4_moe : public llm_graph_context {
        ggml_tensor * hidden_state_norm = build_norm(embd_copy, mtp_layer.nextn.hnorm, NULL, LLM_NORM_RMS, il);
 
        ggml_tensor * combined = ggml_concat(ctx0, token_emb_norm, hidden_state_norm, 0);
+        // cb(combined, "mtp_combined", il);
+
        ggml_tensor* cur = build_lora_mm(mtp_layer.nextn.eh_proj, combined);
 
        // now proceed through last layer (skipped in main model)
        ggml_tensor * inpSA = cur;
        // Pre-attention norm for the MTP block
        cur = build_norm(cur, mtp_layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "mtp_attn_norm", il);
 
        // self-attention
        {
            ggml_tensor * Qcur = build_lora_mm(mtp_layer.wq, cur);
+            // if (mtp_layer.bq) {
+            //     Qcur = ggml_add(ctx0, Qcur, mtp_layer.bq);
+            //     cb(Qcur, "mtp_q_bias", il); // ADDED
+            // }
            if (mtp_layer.bq) Qcur = ggml_add(ctx0, Qcur, mtp_layer.bq);
            cb(Qcur, "Qcur", il);
 
            ggml_tensor * Kcur = build_lora_mm(mtp_layer.wk, cur);
+            // if (mtp_layer.bk) {
+            //     Kcur = ggml_add(ctx0, Kcur, mtp_layer.bk);
+            //     cb(Kcur, "mtp_k_bias", il); // ADDED
+            // }
            if (mtp_layer.bk) Kcur = ggml_add(ctx0, Kcur, mtp_layer.bk);
            cb(Kcur, "Kcur", il);
 
            ggml_tensor * Vcur = build_lora_mm(mtp_layer.wv, cur);
+            // if (mtp_layer.bv) {
+            //     Vcur = ggml_add(ctx0, Vcur, mtp_layer.bv);
+            //     cb(Vcur, "mtp_v_bias", il); // ADDED
+            // }
            if (mtp_layer.bv) Vcur = ggml_add(ctx0, Vcur, mtp_layer.bv);
            cb(Vcur, "Vcur", il);
 
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            // cb(Qcur, "mtp_q_reshaped", il);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            // cb(Kcur, "mtp_k_reshaped", il);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            // cb(Vcur, "mtp_v_reshaped", il);
 
            // Apply Q/K norm if available (GLM-4.5 355B variant)
            if (mtp_layer.attn_q_norm) {

@@ -14023,12 +14056,14 @@ struct llm_build_glm4_moe : public llm_graph_context {
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
+            // cb(Qcur, "mtp_q_rope", il);
 
            Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
+            // cb(Kcur, "mtp_k_rope", il);
 
            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);

@@ -14040,8 +14075,10 @@ struct llm_build_glm4_moe : public llm_graph_context {
        }
 
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        // cb(ffn_inp, "mtp_ffn_inp", il);
 
        cur = build_norm(ffn_inp, mtp_layer.attn_post_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "post_attn_norm", il);
 
        // moe ffn for nextn block
        {

@@ -14073,7 +14110,10 @@ struct llm_build_glm4_moe : public llm_graph_context {
            cb(cur, "ffn_out", il);
        }
        cur = ggml_add(ctx0, cur, ffn_inp);
+        // cb(cur, "mtp_ffn_residual", il);
+
        cur = build_norm(cur, mtp_layer.nextn.shared_head_norm, NULL, LLM_NORM_RMS, il);
+        // cb(cur, "mtp_final_norm", il);
        cur = build_lora_mm(mtp_layer.nextn.shared_head_head, cur);
 
        return cur;

@@ -18305,7 +18345,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 }
 
 ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
-
+    const int64_t t_start_us = ggml_time_us();
    std::unique_ptr<llm_graph_context> llm;
    switch (arch) {
        case LLM_ARCH_LLAMA:

@@ -18664,10 +18704,16 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            GGML_ABORT("fatal error");
    }
 
-    if (params.mtp_params.op_type == MTP_OP_NONE) {
+    if (params.mtp_params.op_type == MTP_OP_NONE || params.mtp_params.op_type == MTP_OP_MAIN_VALIDATION) {
        // add on pooling layer
        llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
    }
+    const int64_t t_end_us = ggml_time_us();
+    LLAMA_LOG_INFO(
+        "[PERF] Graph build time: %.2f ms (MTP path: %s)\n",
+        (t_end_us - t_start_us) / 1000.0,
+        params.mtp_params.op_type != MTP_OP_NONE && params.mtp_params.op_type != MTP_OP_MAIN_VALIDATION ? "yes" : "no"
+    );
    return llm->res->get_gf();
 }
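
The "(MTP path: ...)" label in the final hunk is the same "is this an MTP graph?" test used by the input-preparation guard in llama-context.cpp; note that it must combine the two exclusions with &&, since "op != MTP_OP_NONE || op != MTP_OP_MAIN_VALIDATION" holds for every op (no single value can equal both constants at once). A hypothetical helper, not in the commit, that would centralize the predicate:

    // Hypothetical: one place for the "MTP graph" test used by the [PERF]
    // label and by process_ubatch()'s prepare_mtp_graph_inputs guard.
    static bool llama_mtp_is_mtp_graph(llama_mtp_op_type op) {
        return op != MTP_OP_NONE && op != MTP_OP_MAIN_VALIDATION;
    }

    // e.g.: LLAMA_LOG_INFO("... (MTP path: %s)\n", ...,
    //           llama_mtp_is_mtp_graph(params.mtp_params.op_type) ? "yes" : "no");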
