
Commit 3bfa5d3

mtp-graph (feat): simplify graph logic
1 parent 5859cb9 commit 3bfa5d3

7 files changed: +195, -442 lines

common/speculative.cpp

Lines changed: 2 additions & 56 deletions
@@ -378,7 +378,7 @@ llama_token mtp_speculative_gen_draft(
     const llama_seq_id draft_seq_id = 0;
     common_batch_add(mtp_batch, id_last, n_past, {0}, true);
 
-    mtp_batch.mtp_params.op_type = MTP_OP_DRAFT_GEN;
+    mtp_batch.mtp_params.op_type = MTP_OP_DRAFT_ONLY;
 
     // Perform the MTP draft generation decode. This writes the MTP layer's
     // KV state for the draft token into the cache.
@@ -406,58 +406,4 @@ llama_token mtp_speculative_gen_draft(
     common_sampler_apply_chain(smpl, cur_p);
 
     return cur_p->data[0].id;
-}
-
-
-void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch& batch, bool is_prompt_warmup) {
-    if (batch.n_tokens == 0) {
-        return;
-    }
-
-    LOG_DBG("[MTP-UPDATE|%s] Updating %d tokens...\n", is_prompt_warmup ? "PROMPT_WARMUP" : "GEN_ACCEPTED", batch.n_tokens);
-
-    llama_batch mtp_batch = batch;
-    if (is_prompt_warmup) {
-        mtp_batch.mtp_params.op_type = MTP_OP_WARMUP;
-    } else {
-        mtp_batch.mtp_params.op_type = MTP_OP_UPDATE_ACCEPTED;
-    }
-
-    for (int i = 0; i < mtp_batch.n_tokens; ++i) {
-        mtp_batch.logits[i] = true;
-    }
-    const int64_t t_start_us = ggml_time_us();
-    llama_decode(ctx, mtp_batch);
-    const int64_t t_end_us = ggml_time_us();
-    LOG_INF("[PERF-MTP] mtp_update_kv_cache internal decode (op=%d): %.2f ms\n", (int)mtp_batch.mtp_params.op_type, (t_end_us - t_start_us) / 1000.0);
-}
-
-void mtp_accept_tokens(
-    struct llama_context * ctx,
-    const std::vector<llama_token> & ids,
-    int32_t n_past_base,
-    llama_seq_id seq_id
-) {
-    if (ids.empty()) {
-        return;
-    }
-
-    // Prepare a resized copy of the validation sinfo to match the number of accepted tokens.
-    // This sets up the context for a "forced sinfo" decode.
-    if (!llama_mtp_prepare_sinfo_for_update(ctx, ids.size())) {
-        return;
-    }
-
-    // Build a new batch containing only the accepted tokens.
-    llama_batch accepted_batch = llama_batch_init(ids.size(), 0, 1);
-    for (size_t i = 0; i < ids.size(); ++i) {
-        common_batch_add(accepted_batch, ids[i], n_past_base + i, { seq_id }, true);
-    }
-
-    mtp_update_kv_cache(ctx, accepted_batch, false);
-
-    // Clean up the forced state to not affect subsequent, normal decode calls.
-    llama_mtp_cancel_sinfo_update(ctx);
-
-    llama_batch_free(accepted_batch);
-}
+}
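For context, the two helpers deleted above were how callers kept the MTP head's KV cache in sync outside the main decode: a separate llama_decode() per speculation round (MTP_OP_UPDATE_ACCEPTED), plus one for prompt warmup (MTP_OP_WARMUP). A minimal sketch of that now-removed call site, assuming "accepted" holds the draft ids the main model kept after validation; the wrapper name is illustrative and this no longer compiles against the post-commit tree:

#include <vector>
#include "common/speculative.h"  // pre-commit header, declared the helpers removed above

// Pre-commit caller-side sync (the path this commit deletes).
static void sync_mtp_cache(struct llama_context * ctx,
                           const std::vector<llama_token> & accepted,
                           int32_t n_past_base,
                           llama_seq_id seq_id) {
    // Resized the validation sinfo to accepted.size(), batched just the
    // accepted tokens, and issued an extra MTP-only decode tagged
    // MTP_OP_UPDATE_ACCEPTED via mtp_update_kv_cache().
    mtp_accept_tokens(ctx, accepted, n_past_base, seq_id);
}

After this commit that bookkeeping path is gone; the one remaining MTP decode, in mtp_speculative_gen_draft(), is tagged MTP_OP_DRAFT_ONLY.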

common/speculative.h

Lines changed: 1 addition & 10 deletions
@@ -47,13 +47,4 @@ llama_tokens common_speculative_gen_draft(
     struct common_speculative * spec,
     struct common_speculative_params params,
     const llama_tokens & prompt,
-    llama_token id_last);
-
-void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch& batch, bool is_prompt_warmup);
-
-void mtp_accept_tokens(
-    struct llama_context * ctx,
-    const std::vector<llama_token> & ids,
-    int32_t n_past_base,
-    llama_seq_id seq_id
-);
+    llama_token id_last);

include/llama.h

Lines changed: 2 additions & 24 deletions
@@ -223,10 +223,8 @@ extern "C" {
     //
     typedef enum {
        MTP_OP_NONE,
-       MTP_OP_WARMUP,
-       MTP_OP_UPDATE_ACCEPTED,
-       MTP_OP_DRAFT_GEN,
-       MTP_OP_MAIN_VALIDATION,
+       MTP_OP_DRAFT_ONLY,
+       MTP_OP_UNIFIED,
     } llama_mtp_op_type;
 
     typedef struct llama_mtp_params {
@@ -1473,26 +1471,6 @@ extern "C" {
 
     LLAMA_API void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state);
 
-    /**
-     * @brief Prepares the context for an MTP KV cache update by creating a resized copy of the last sinfo.
-     * This is used after speculative validation when only a subset of draft tokens are accepted.
-     * @param n_accepted The number of tokens that were accepted and for which the sinfo should be resized.
-     * @return true on success.
-     */
-    LLAMA_API bool llama_mtp_prepare_sinfo_for_update(struct llama_context * ctx, size_t n_accepted);
-
-    /**
-     * @brief Prepares the context for an MTP KV cache update by reusing the sinfo from the last main model decode.
-     * This is used for the prompt warmup to ensure the MTP and main model KV caches are perfectly aligned.
-     * @return true on success.
-     */
-    LLAMA_API bool llama_mtp_prepare_sinfo_for_warmup(struct llama_context * ctx);
-
-    /**
-     * @brief Clears the forced sinfo state from the context. Must be called after a decode that used a prepared sinfo.
-     */
-    LLAMA_API void llama_mtp_cancel_sinfo_update(struct llama_context * ctx);
-
     /**
      * @brief Removes KV cache metadata for a specified sequence and token range.
      * This makes the physical cells logically available again without deleting the tensor data.
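With the enum reduced to two active ops, per-decode selection collapses to a single branch. A sketch of the assumed usage; that MTP_OP_UNIFIED covers a fused main-model validation plus MTP KV update is inferred from the name and the ops it replaces, not stated in this diff, and the helper below is illustrative rather than part of the tree:

#include "llama.h"

// Assumed post-commit op selection for an MTP-aware decode.
static void set_mtp_op(struct llama_batch & batch, bool drafting) {
    batch.mtp_params.op_type = drafting
        ? MTP_OP_DRAFT_ONLY  // MTP head drafts a token (successor to MTP_OP_DRAFT_GEN)
        : MTP_OP_UNIFIED;    // assumed: validation + MTP KV update in one pass
}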
