From 37298a63d870f9d6aaebfc014e0bb3f07e54f04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Jan 2026 09:45:07 +0200 Subject: [PATCH 1/7] spec : add ngram-mod --- common/CMakeLists.txt | 2 + common/arg.cpp | 4 +- common/common.h | 5 ++ common/ngram-map.cpp | 35 ++++++------- common/ngram-map.h | 1 + common/ngram-mod.cpp | 44 ++++++++++++++++ common/ngram-mod.h | 30 +++++++++++ common/speculative.cpp | 91 +++++++++++++++++++++++++++++++++ tools/perplexity/perplexity.cpp | 33 ++++++++++++ tools/server/server-context.cpp | 25 +++++++++ 10 files changed, 249 insertions(+), 21 deletions(-) create mode 100644 common/ngram-mod.cpp create mode 100644 common/ngram-mod.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3bc7bc6210b..295ae9ea254 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -75,6 +75,8 @@ add_library(${TARGET} STATIC ngram-cache.h ngram-map.cpp ngram-map.h + ngram-mod.cpp + ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 218418f0701..0c4be8fd2f6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]", + {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", common_speculative_type_to_str(params.speculative.type).c_str()), [](common_params & params, const std::string & value) { @@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K; } else if (value == "ngram-map-k4v") { params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; + } else if (value == "ngram-mod") { + params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } diff --git a/common/common.h b/common/common.h index fd3ab8cd180..c1b9aea7a3e 100644 --- a/common/common.h +++ b/common/common.h @@ -171,6 +171,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values + COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type }; @@ -252,6 +253,8 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; +struct common_ngram_mod; + struct common_params_speculative { common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding @@ -269,6 +272,8 @@ struct common_params_speculative { uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed + common_ngram_mod * ngram_mod = nullptr; + std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup 
decoding // NOLINT diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 930e7a3c10f..84fd761367d 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -7,6 +7,21 @@ #include #include +// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. +static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { + std::ostringstream oss; + oss << '['; + for (size_t i = 0; i < length; ++i) { + if (i > 0) { + oss << ", "; + } + oss << inp[start + i]; + } + oss << ']'; + return oss.str(); +} + + // n-gram simple // @@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft( // maximum number of counted values of a ngram map value. #define COMMON_NGRAM_MAX_VALUE_COUNT 16380 -static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length); - void common_ngram_map_draft(common_ngram_map & map, const llama_tokens & inp, llama_token sampled, llama_tokens & draft) { @@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) { n_accepted, curr_value.n_accepted); curr_value.n_accepted = n_accepted; } - -// Helper functions. -// - -// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. -std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { - std::ostringstream oss; - oss << '['; - for (size_t i = 0; i < length; ++i) { - if (i > 0) { - oss << ", "; - } - oss << inp[start + i]; - } - oss << ']'; - return oss.str(); -} - diff --git a/common/ngram-map.h b/common/ngram-map.h index bf91883f0c3..b365034ac51 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -11,6 +11,7 @@ // #include "llama.h" +#include "common.h" #include diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp new file mode 100644 index 00000000000..86ffa3a0697 --- /dev/null +++ b/common/ngram-mod.cpp @@ -0,0 +1,44 @@ +#include "ngram-mod.h" + +common_ngram_mod::common_ngram_mod(uint16_t n, uint64_t size) : n(n) { + entries.resize(size); +} + +uint64_t common_ngram_mod::idx(const int32_t * tokens) const { + uint64_t res = 0; + + for (uint64_t i = 0; i < n; ++i) { + res = (res * 6364136223846793005ULL + tokens[i]); + } + + res = res % entries.size(); + + return res; +} + +void common_ngram_mod::add(const int32_t * tokens) { + const uint64_t i = idx(tokens); + + common_ngram_mod_entry & entry = entries[i]; + + if (entry.n_choices < COMMON_NGRAM_MOD_MAX_CHOICES) { + entry.n_choices++; + } + + entry.choices[entry.head] = tokens[n]; + entry.head = (entry.head + 1) % COMMON_NGRAM_MOD_MAX_CHOICES; +} + +int32_t common_ngram_mod::get(const int32_t * tokens, int32_t offs) const { + const uint64_t i = idx(tokens); + + const common_ngram_mod_entry & entry = entries[i]; + + if (entry.n_choices == 0) { + return -1; + } + + const int32_t k = (offs + entry.head) % entry.n_choices; + + return entry.choices[k]; +} diff --git a/common/ngram-mod.h b/common/ngram-mod.h new file mode 100644 index 00000000000..3186b80743c --- /dev/null +++ b/common/ngram-mod.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +#define COMMON_NGRAM_MOD_MAX_CHOICES 4 + +struct common_ngram_mod_entry { + uint16_t head = 0; + uint16_t n_choices = 0; + + int32_t choices[COMMON_NGRAM_MOD_MAX_CHOICES]; +}; + +// basic n-gram hasher +struct common_ngram_mod { + common_ngram_mod(uint16_t n, uint64_t size); + + uint64_t idx(const int32_t * tokens) const; + + void add(const int32_t * tokens); + int32_t get(const int32_t * tokens, 
int32_t offs) const; // return -1 if not found + + uint16_t n; // ngram size to hash + + std::vector entries; +}; + +using common_ngram_mod_ptr = std::unique_ptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3f314b5d578..c07e414400d 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -6,6 +6,7 @@ #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" +#include "ngram-mod.h" #include "sampling.h" #include @@ -23,6 +24,7 @@ const std::vector common_speculative_types = { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, + COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE }; @@ -33,6 +35,7 @@ const std::map common_speculative_typ {"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, + {"ngram_map_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD}, {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; @@ -509,6 +512,84 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state { } }; +struct common_speculative_state_ngram_mod : public common_speculative_state { + common_ngram_mod & mod; + + // size of the last begin() prompt + size_t n_last = 0; + + common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod) + : common_speculative_state(type), mod(mod) {} + + void begin(const llama_tokens & prompt) override { + n_last = 0; + + if (prompt.size() < (size_t) mod.n) { + return; + } + + for (size_t i = 0; i < prompt.size() - mod.n; ++i) { + mod.add(prompt.data() + i); + } + + n_last = prompt.size() - mod.n; + } + + void draft( + const common_params_speculative & params, + const llama_tokens & prompt_tgt, + llama_token id_last, + llama_tokens & result) override { + GGML_UNUSED(params); + + const size_t cur_len = prompt_tgt.size(); + if (cur_len < (size_t) mod.n) { + return; + } + + // add new ngrams in chunks + if (n_last + 16*mod.n < cur_len) { + for (size_t i = n_last; i < cur_len - mod.n; ++i) { + mod.add(prompt_tgt.data() + i); + } + + n_last = cur_len - mod.n; + } + + result.resize(mod.n + params.n_max); + for (size_t i = 0; i < mod.n - 1; ++i) { + result[i] = prompt_tgt[cur_len - mod.n + 1 + i]; + } + result[mod.n - 1] = id_last; + + for (int i = 0; i < params.n_max; ++i) { + const llama_token token = mod.get(result.data() + i, cur_len + i); + if (token == LLAMA_TOKEN_NULL) { + if (i < params.n_min) { + result.clear(); + return; + } + + result.resize(mod.n + i); + break; + } + result[mod.n + i] = token; + } + + // only return the m tokens that were drafted + for (size_t i = 0; mod.n + i < result.size(); ++i) { + result[i] = result[mod.n + i]; + } + result.resize(result.size() - mod.n); + } + + void accept(uint16_t n_accepted) override { + LOG_WRN("XXXXXXXXXXXXX n_accepted = %d\n", n_accepted); + // noop + GGML_UNUSED(n_accepted); + } +}; + struct common_speculative_state_ngram_cache : public common_speculative_state { uint16_t n_draft; bool save_dynamic; @@ -650,6 +731,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v"; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD: return "ngram_map_mod"; case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache"; default: return 
"unknown"; } @@ -687,6 +769,7 @@ common_speculative * common_speculative_init( bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + bool has_ngram_map_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD); // In a more complex implementation we could use the same implementation but with different parameters. // This was initially used in PR-18471 but removed to simplify the code. @@ -701,6 +784,9 @@ common_speculative * common_speculative_init( // This implementation can guess tokens with high acceptance rate but is more expensive. configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); } + if (has_ngram_map_mod) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, params)); + } if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } @@ -758,6 +844,11 @@ common_speculative * common_speculative_init( )); break; } + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD: { + GGML_ASSERT(config.params.ngram_mod); + impls.push_back(std::make_unique(config.type, *config.params.ngram_mod)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: { auto state = create_state_ngram_cache( params.lookup_cache_static, params.lookup_cache_dynamic, config); diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 1ead9c871e9..31e918507f6 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "ngram-mod.h" #include #include @@ -471,6 +472,38 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & std::vector tokens = common_tokenize(ctx, params.prompt, true); + // TODO: remove + //{ + // common_ngram_mod ngrams(16, 1, 1024*1024); + + // for (size_t i = 0; i < tokens.size() - ngrams.n; ++i) { + // ngrams.add(tokens.data() + i); + // } + // // Determine the maximum number of choices across all bins + // int max_choices = 0; + // for (const auto & entry : ngrams.entries) { + // if (entry.n_choices > max_choices) { + // max_choices = entry.n_choices; + // } + // } + + // // make max_choices the next power of 2 + // max_choices = 1 << (32 - __builtin_clz(max_choices)); + + // // Build a histogram: count how many bins have each possible number of choices + // std::vector histogram(max_choices + 1, 0); + // for (const auto & entry : ngrams.entries) { + // histogram[entry.n_choices]++; + // } + + // // Print the histogram + // LOG_INF("Histogram of choices (0 .. 
%d):\n", max_choices); + // for (int i = 0; i <= max_choices; ++i) { + // LOG_INF("choices %3d: %d bins\n", i, histogram[i]); + // } + //} + //exit(0); + auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 1ca4e3cc0e9..f4bb9d0af39 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -11,6 +11,7 @@ #include "speculative.h" #include "mtmd.h" #include "mtmd-helper.h" +#include "ngram-mod.h" #include #include @@ -560,6 +561,8 @@ struct server_context_impl { llama_model_ptr model_dft; + common_ngram_mod_ptr ngram_mod; + bool add_bos_token = true; int32_t n_ctx; // total context for all clients / slots @@ -705,6 +708,11 @@ struct server_context_impl { params_base.n_cache_reuse = 0; SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } + + if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { + params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; + SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled"); + } } if (!llama_memory_can_shift(llama_get_memory(ctx))) { @@ -748,6 +756,18 @@ struct server_context_impl { // try speculative decoding { + // initialize a shared ngram_mod instance for all speculative contexts + if (!ngram_mod && params_base.speculative.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD) { + ngram_mod = std::make_unique(params_base.speculative.ngram_size_n, 1024*1024); + + params_base.speculative.ngram_mod = ngram_mod.get(); + + SRV_INF("initialized ngram_mod with n=%d, size=%d (%.3f MB)\n", + params_base.speculative.ngram_size_n, 1024*1024, + (float)(1024*1024*sizeof(common_ngram_mod_entry))/1024/1024 + ); + } + slot.spec = common_speculative_init(params_base.speculative, slot.ctx); if (slot.spec) { if (mctx) { @@ -2046,6 +2066,11 @@ struct server_context_impl { draft.resize(n_draft_max); } + if (draft.size() > 0) { + std::string tmp = common_detokenize(slot.ctx, draft); + //LOG_WRN("XXXXXX: draft: '%s'\n", tmp.c_str()); + } + // add the sampled token to the batch slot.i_batch_dft.push_back(batch.n_tokens); common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); From 7f52bcafebebe0165f9ff5db017208fb1b190784 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 14:07:36 +0200 Subject: [PATCH 2/7] cont : simplify + keep track of occupancy --- common/ngram-mod.cpp | 83 +++++++++++++++++++++++++++++---- common/ngram-mod.h | 53 +++++++++++++++++---- common/speculative.cpp | 58 ++++++++++++++--------- tools/server/server-context.cpp | 9 ++-- 4 files changed, 158 insertions(+), 45 deletions(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index 86ffa3a0697..58db1fb283f 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -1,13 +1,76 @@ #include "ngram-mod.h" -common_ngram_mod::common_ngram_mod(uint16_t n, uint64_t size) : n(n) { +// +// common_ngram_mod +// + +common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) { + entries.resize(size); + + std::fill(entries.begin(), entries.end(), EMPTY); +} + +size_t common_ngram_mod::idx(const entry_t * tokens) const { + size_t res = 0; + + for (size_t i = 0; i < n; ++i) { + res = (res * 6364136223846793005ULL + tokens[i]); + } + + res = res % entries.size(); + + return res; +} + +void common_ngram_mod::add(const entry_t * tokens) { + const size_t i = 
idx(tokens); + + if (entries[i] != EMPTY) { + used++; + } + + entries[i] = tokens[n]; +} + +common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const { + const size_t i = idx(tokens); + + return entries[i]; +} + +void common_ngram_mod::reset() { + std::fill(entries.begin(), entries.end(), EMPTY); + used = 0; +} + +size_t common_ngram_mod::get_n() const { + return n; +} + +size_t common_ngram_mod::get_used() const { + return used; +} + +size_t common_ngram_mod::size() const { + return entries.size(); +} + +size_t common_ngram_mod::size_bytes() const { + return entries.size() * sizeof(entries[0]); +} + +// +// common_ngram_mod_ext +// + +common_ngram_mod_ext::common_ngram_mod_ext(uint16_t n, size_t size) : n(n) { entries.resize(size); } -uint64_t common_ngram_mod::idx(const int32_t * tokens) const { - uint64_t res = 0; +size_t common_ngram_mod_ext::idx(const int32_t * tokens) const { + size_t res = 0; - for (uint64_t i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { res = (res * 6364136223846793005ULL + tokens[i]); } @@ -16,10 +79,10 @@ uint64_t common_ngram_mod::idx(const int32_t * tokens) const { return res; } -void common_ngram_mod::add(const int32_t * tokens) { - const uint64_t i = idx(tokens); +void common_ngram_mod_ext::add(const int32_t * tokens) { + const size_t i = idx(tokens); - common_ngram_mod_entry & entry = entries[i]; + common_ngram_mod_ext_entry & entry = entries[i]; if (entry.n_choices < COMMON_NGRAM_MOD_MAX_CHOICES) { entry.n_choices++; @@ -29,10 +92,10 @@ void common_ngram_mod::add(const int32_t * tokens) { entry.head = (entry.head + 1) % COMMON_NGRAM_MOD_MAX_CHOICES; } -int32_t common_ngram_mod::get(const int32_t * tokens, int32_t offs) const { - const uint64_t i = idx(tokens); +int32_t common_ngram_mod_ext::get(const int32_t * tokens, int32_t offs) const { + const size_t i = idx(tokens); - const common_ngram_mod_entry & entry = entries[i]; + const common_ngram_mod_ext_entry & entry = entries[i]; if (entry.n_choices == 0) { return -1; diff --git a/common/ngram-mod.h b/common/ngram-mod.h index 3186b80743c..3f28ee89d76 100644 --- a/common/ngram-mod.h +++ b/common/ngram-mod.h @@ -4,27 +4,64 @@ #include #include +// +// common_ngram_mod +// + +// basic n-gram hasher +struct common_ngram_mod { + using entry_t = int32_t; + + static constexpr entry_t EMPTY = -1; + + common_ngram_mod(uint16_t n, size_t size); + + size_t idx(const entry_t * tokens) const; + void add(const entry_t * tokens); + entry_t get(const entry_t * tokens) const; // return -1 if not found + + void reset(); + + size_t get_n() const; + size_t get_used() const; + + size_t size() const; + size_t size_bytes() const; + +private: + size_t n; // ngram size to hash + + size_t used; + + std::vector entries; +}; + +using common_ngram_mod_ptr = std::unique_ptr; + +// +// common_ngram_mod_ext (for experiments) +// + #define COMMON_NGRAM_MOD_MAX_CHOICES 4 -struct common_ngram_mod_entry { +struct common_ngram_mod_ext_entry { uint16_t head = 0; uint16_t n_choices = 0; int32_t choices[COMMON_NGRAM_MOD_MAX_CHOICES]; }; -// basic n-gram hasher -struct common_ngram_mod { - common_ngram_mod(uint16_t n, uint64_t size); +struct common_ngram_mod_ext { + common_ngram_mod_ext(uint16_t n, size_t size); - uint64_t idx(const int32_t * tokens) const; + size_t idx(const int32_t * tokens) const; void add(const int32_t * tokens); int32_t get(const int32_t * tokens, int32_t offs) const; // return -1 if not found - uint16_t n; // ngram size to hash + size_t n; // ngram size to hash - std::vector entries; + std::vector 
entries; }; -using common_ngram_mod_ptr = std::unique_ptr; +using common_ngram_mod_ext_ptr = std::unique_ptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index c07e414400d..0e6357e3b6b 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -515,24 +515,36 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state { struct common_speculative_state_ngram_mod : public common_speculative_state { common_ngram_mod & mod; - // size of the last begin() prompt - size_t n_last = 0; + // the last position in the prompt that was added to the ngram container + size_t i_last = 0; common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod) - : common_speculative_state(type), mod(mod) {} + : common_speculative_state(type), mod(mod) { + static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); + } void begin(const llama_tokens & prompt) override { - n_last = 0; + i_last = 0; + + const size_t n = mod.get_n(); - if (prompt.size() < (size_t) mod.n) { + if (prompt.size() < n) { return; } - for (size_t i = 0; i < prompt.size() - mod.n; ++i) { + for (size_t i = 0; i < prompt.size() - n; ++i) { mod.add(prompt.data() + i); } - n_last = prompt.size() - mod.n; + i_last = prompt.size() - n; + + const double f = (double)mod.get_used() * 100.0 / (double)mod.size(); + LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f%%)\\n", __func__, mod.get_used(), mod.size(), f); + + if (f > 0.25) { + mod.reset(); + LOG_WRN("%s: ngram_mod occupancy %.2f%% exceeds threshold, resetting\n", __func__, f); + } } void draft( @@ -543,44 +555,46 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { GGML_UNUSED(params); const size_t cur_len = prompt_tgt.size(); - if (cur_len < (size_t) mod.n) { + if (cur_len < mod.get_n()) { return; } + const size_t n = mod.get_n(); + // add new ngrams in chunks - if (n_last + 16*mod.n < cur_len) { - for (size_t i = n_last; i < cur_len - mod.n; ++i) { + if (i_last + 16*n < cur_len) { + for (size_t i = i_last; i < cur_len - n; ++i) { mod.add(prompt_tgt.data() + i); } - n_last = cur_len - mod.n; + i_last = cur_len - n; } - result.resize(mod.n + params.n_max); - for (size_t i = 0; i < mod.n - 1; ++i) { - result[i] = prompt_tgt[cur_len - mod.n + 1 + i]; + result.resize(n + params.n_max); + for (size_t i = 0; i < n - 1; ++i) { + result[i] = prompt_tgt[cur_len - n + 1 + i]; } - result[mod.n - 1] = id_last; + result[n - 1] = id_last; for (int i = 0; i < params.n_max; ++i) { - const llama_token token = mod.get(result.data() + i, cur_len + i); - if (token == LLAMA_TOKEN_NULL) { + const llama_token token = mod.get(result.data() + i); + if (token == common_ngram_mod::EMPTY) { if (i < params.n_min) { result.clear(); return; } - result.resize(mod.n + i); + result.resize(n + i); break; } - result[mod.n + i] = token; + result[n + i] = token; } // only return the m tokens that were drafted - for (size_t i = 0; mod.n + i < result.size(); ++i) { - result[i] = result[mod.n + i]; + for (size_t i = 0; n + i < result.size(); ++i) { + result[i] = result[n + i]; } - result.resize(result.size() - mod.n); + result.resize(result.size() - n); } void accept(uint16_t n_accepted) override { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f4bb9d0af39..fef6bb73522 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -758,14 +758,13 @@ struct server_context_impl { { // initialize a shared ngram_mod instance for all speculative contexts if 
(!ngram_mod && params_base.speculative.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD) { - ngram_mod = std::make_unique(params_base.speculative.ngram_size_n, 1024*1024); + ngram_mod = std::make_unique(params_base.speculative.ngram_size_n, 4*1024*1024); params_base.speculative.ngram_mod = ngram_mod.get(); - SRV_INF("initialized ngram_mod with n=%d, size=%d (%.3f MB)\n", - params_base.speculative.ngram_size_n, 1024*1024, - (float)(1024*1024*sizeof(common_ngram_mod_entry))/1024/1024 - ); + SRV_INF("initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", + params_base.speculative.ngram_size_n, ngram_mod->size(), + (float)(ngram_mod->size_bytes())/1024/1024); } slot.spec = common_speculative_init(params_base.speculative, slot.ctx); From 57173c3719730812c9e92be5a79563f88f63587e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 14:31:17 +0200 Subject: [PATCH 3/7] cont : cleanup --- common/speculative.cpp | 51 +++++++++++++++++++++++++++------ tools/perplexity/perplexity.cpp | 33 --------------------- tools/server/server-context.cpp | 9 ++---- 3 files changed, 44 insertions(+), 49 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 0e6357e3b6b..be20880beab 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -518,14 +518,25 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { // the last position in the prompt that was added to the ngram container size_t i_last = 0; + // length of the last drafted n‑gram (number of tokens returned by draft) + size_t n_draft_last = 0; + + // consecutive accept rounds with low acceptance fraction (< 0.5) + int n_low = 0; + + // enable trace logging if LLAMA_TRACE is set + const bool verbose; + common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod) - : common_speculative_state(type), mod(mod) { - static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); + : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) { + static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); } void begin(const llama_tokens & prompt) override { i_last = 0; + n_draft_last = 0; + const size_t n = mod.get_n(); if (prompt.size() < n) { @@ -538,12 +549,14 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { i_last = prompt.size() - n; - const double f = (double)mod.get_used() * 100.0 / (double)mod.size(); - LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f%%)\\n", __func__, mod.get_used(), mod.size(), f); + const double f = (double)mod.get_used() / (double)mod.size(); + LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f); + + constexpr double f_thold = 0.25; + if (f > f_thold) { + LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold); - if (f > 0.25) { mod.reset(); - LOG_WRN("%s: ngram_mod occupancy %.2f%% exceeds threshold, resetting\n", __func__, f); } } @@ -554,6 +567,8 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { llama_tokens & result) override { GGML_UNUSED(params); + n_draft_last = 0; + const size_t cur_len = prompt_tgt.size(); if (cur_len < mod.get_n()) { return; @@ -595,12 +610,30 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { result[i] = result[n + i]; } result.resize(result.size() - n); + + // store length of drafted n‑gram for later acceptance analysis + n_draft_last = result.size(); } void accept(uint16_t 
n_accepted) override { - LOG_WRN("XXXXXXXXXXXXX n_accepted = %d\n", n_accepted); - // noop - GGML_UNUSED(n_accepted); + if (verbose) { + LOG_INF("%s: accepted %d tokens\n", __func__, n_accepted); + } + + // compute acceptance fraction if we have a recorded draft length + if (n_draft_last > 0) { + const double f_acc = (double)n_accepted / (double)n_draft_last; + if (f_acc < 0.5) { + n_low++; + if (n_low >= 3) { + LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + mod.reset(); + n_low = 0; + } + } else { + n_low = 0; + } + } } }; diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 31e918507f6..1ead9c871e9 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -2,7 +2,6 @@ #include "common.h" #include "log.h" #include "llama.h" -#include "ngram-mod.h" #include #include @@ -472,38 +471,6 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & std::vector tokens = common_tokenize(ctx, params.prompt, true); - // TODO: remove - //{ - // common_ngram_mod ngrams(16, 1, 1024*1024); - - // for (size_t i = 0; i < tokens.size() - ngrams.n; ++i) { - // ngrams.add(tokens.data() + i); - // } - // // Determine the maximum number of choices across all bins - // int max_choices = 0; - // for (const auto & entry : ngrams.entries) { - // if (entry.n_choices > max_choices) { - // max_choices = entry.n_choices; - // } - // } - - // // make max_choices the next power of 2 - // max_choices = 1 << (32 - __builtin_clz(max_choices)); - - // // Build a histogram: count how many bins have each possible number of choices - // std::vector histogram(max_choices + 1, 0); - // for (const auto & entry : ngrams.entries) { - // histogram[entry.n_choices]++; - // } - - // // Print the histogram - // LOG_INF("Histogram of choices (0 .. 
%d):\n", max_choices); - // for (int i = 0; i <= max_choices; ++i) { - // LOG_INF("choices %3d: %d bins\n", i, histogram[i]); - // } - //} - //exit(0); - auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index fef6bb73522..85fde205ade 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -773,9 +773,9 @@ struct server_context_impl { SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); return false; } - SRV_WRN("%s", "speculative decoding context initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context initialized\n"); } else { - SRV_WRN("%s", "speculative decoding context not initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context not initialized\n"); } } @@ -2065,11 +2065,6 @@ struct server_context_impl { draft.resize(n_draft_max); } - if (draft.size() > 0) { - std::string tmp = common_detokenize(slot.ctx, draft); - //LOG_WRN("XXXXXX: draft: '%s'\n", tmp.c_str()); - } - // add the sampled token to the batch slot.i_batch_dft.push_back(batch.n_tokens); common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); From a9a076f971cab365d3402509e6c2c2f5247e1dc4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 15:19:51 +0200 Subject: [PATCH 4/7] cont : move initialization to common/speculative --- common/arg.cpp | 2 +- common/common.h | 4 ++-- common/speculative.cpp | 29 +++++++++++++++++++---------- common/speculative.h | 4 ++-- tools/server/server-context.cpp | 11 ----------- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0c4be8fd2f6..5fbc9022c02 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3411,7 +3411,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } else if (value == "ngram-map-k4v") { params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; } else if (value == "ngram-mod") { - params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD; + params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } diff --git a/common/common.h b/common/common.h index c1b9aea7a3e..398ebb09601 100644 --- a/common/common.h +++ b/common/common.h @@ -171,7 +171,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values - COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type }; @@ -272,7 +272,7 @@ struct common_params_speculative { uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed - common_ngram_mod * ngram_mod = nullptr; + std::shared_ptr ngram_mod; std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/speculative.cpp 
b/common/speculative.cpp index be20880beab..a5c2dc5e577 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -24,7 +24,7 @@ const std::vector common_speculative_types = { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, - COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE }; @@ -35,7 +35,7 @@ const std::map common_speculative_typ {"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, - {"ngram_map_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD}, + {"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD}, {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; @@ -617,7 +617,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { void accept(uint16_t n_accepted) override { if (verbose) { - LOG_INF("%s: accepted %d tokens\n", __func__, n_accepted); + LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last); } // compute acceptance fraction if we have a recorded draft length @@ -778,7 +778,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v"; - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD: return "ngram_map_mod"; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod"; case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache"; default: return "unknown"; } @@ -795,8 +795,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // initialization of the speculative decoding system // common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt) { + common_params_speculative & params, + llama_context * ctx_tgt) { llama_context * ctx_dft = nullptr; if (params.model_dft) { ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft); @@ -816,7 +816,7 @@ common_speculative * common_speculative_init( bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); - bool has_ngram_map_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD); + bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD); // In a more complex implementation we could use the same implementation but with different parameters. // This was initially used in PR-18471 but removed to simplify the code. @@ -831,8 +831,17 @@ common_speculative * common_speculative_init( // This implementation can guess tokens with high acceptance rate but is more expensive. 
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); } - if (has_ngram_map_mod) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD, params)); + if (has_ngram_mod) { + // shared instance for all speculative decoding contexts + if (!params.ngram_mod) { + params.ngram_mod = std::make_shared(params.ngram_size_n, 4*1024*1024); + + LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__, + params.ngram_size_n, params.ngram_mod->size(), + (float)(params.ngram_mod->size_bytes())/1024/1024); + } + + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params)); } if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); @@ -891,7 +900,7 @@ common_speculative * common_speculative_init( )); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD: { + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { GGML_ASSERT(config.params.ngram_mod); impls.push_back(std::make_unique(config.type, *config.params.ngram_mod)); break; diff --git a/common/speculative.h b/common/speculative.h index 9e1888e4be0..76fe6bb7bca 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string std::string common_speculative_type_to_str(enum common_speculative_type type); common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt); + common_params_speculative & params, + llama_context * ctx_tgt); void common_speculative_free(common_speculative * spec); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 85fde205ade..e143e26a87a 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -756,17 +756,6 @@ struct server_context_impl { // try speculative decoding { - // initialize a shared ngram_mod instance for all speculative contexts - if (!ngram_mod && params_base.speculative.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_MOD) { - ngram_mod = std::make_unique(params_base.speculative.ngram_size_n, 4*1024*1024); - - params_base.speculative.ngram_mod = ngram_mod.get(); - - SRV_INF("initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", - params_base.speculative.ngram_size_n, ngram_mod->size(), - (float)(ngram_mod->size_bytes())/1024/1024); - } - slot.spec = common_speculative_init(params_base.speculative, slot.ctx); if (slot.spec) { if (mctx) { From 1644da7870df5fee8b83c6f3b42e039e0be81bd0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 15:28:53 +0200 Subject: [PATCH 5/7] cont : cleanup --- tools/server/server-context.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index e143e26a87a..6f26fc9a9b2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -11,7 +11,6 @@ #include "speculative.h" #include "mtmd.h" #include "mtmd-helper.h" -#include "ngram-mod.h" #include #include @@ -561,8 +560,6 @@ struct server_context_impl { llama_model_ptr model_dft; - common_ngram_mod_ptr ngram_mod; - bool add_bos_token = true; int32_t n_ctx; // total context for all clients / slots From 20a0483f16744ce28ea77332f01c157152040011 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 17:58:11 +0200 Subject: [PATCH 6/7] cont : cleanup --- common/ngram-mod.cpp | 49 +----------------------------------------- common/ngram-mod.h | 32 
+-------------------------- common/speculative.cpp | 15 ++++++++++--- 3 files changed, 14 insertions(+), 82 deletions(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index 58db1fb283f..b16f79a0753 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -14,7 +14,7 @@ size_t common_ngram_mod::idx(const entry_t * tokens) const { size_t res = 0; for (size_t i = 0; i < n; ++i) { - res = (res * 6364136223846793005ULL + tokens[i]); + res = res*6364136223846793005ULL + tokens[i]; } res = res % entries.size(); @@ -58,50 +58,3 @@ size_t common_ngram_mod::size() const { size_t common_ngram_mod::size_bytes() const { return entries.size() * sizeof(entries[0]); } - -// -// common_ngram_mod_ext -// - -common_ngram_mod_ext::common_ngram_mod_ext(uint16_t n, size_t size) : n(n) { - entries.resize(size); -} - -size_t common_ngram_mod_ext::idx(const int32_t * tokens) const { - size_t res = 0; - - for (size_t i = 0; i < n; ++i) { - res = (res * 6364136223846793005ULL + tokens[i]); - } - - res = res % entries.size(); - - return res; -} - -void common_ngram_mod_ext::add(const int32_t * tokens) { - const size_t i = idx(tokens); - - common_ngram_mod_ext_entry & entry = entries[i]; - - if (entry.n_choices < COMMON_NGRAM_MOD_MAX_CHOICES) { - entry.n_choices++; - } - - entry.choices[entry.head] = tokens[n]; - entry.head = (entry.head + 1) % COMMON_NGRAM_MOD_MAX_CHOICES; -} - -int32_t common_ngram_mod_ext::get(const int32_t * tokens, int32_t offs) const { - const size_t i = idx(tokens); - - const common_ngram_mod_ext_entry & entry = entries[i]; - - if (entry.n_choices == 0) { - return -1; - } - - const int32_t k = (offs + entry.head) % entry.n_choices; - - return entry.choices[k]; -} diff --git a/common/ngram-mod.h b/common/ngram-mod.h index 3f28ee89d76..cf3c89c915d 100644 --- a/common/ngram-mod.h +++ b/common/ngram-mod.h @@ -2,10 +2,10 @@ #include #include -#include // // common_ngram_mod +// ref: https://github.com/ggml-org/llama.cpp/pull/19164 // // basic n-gram hasher @@ -35,33 +35,3 @@ struct common_ngram_mod { std::vector entries; }; - -using common_ngram_mod_ptr = std::unique_ptr; - -// -// common_ngram_mod_ext (for experiments) -// - -#define COMMON_NGRAM_MOD_MAX_CHOICES 4 - -struct common_ngram_mod_ext_entry { - uint16_t head = 0; - uint16_t n_choices = 0; - - int32_t choices[COMMON_NGRAM_MOD_MAX_CHOICES]; -}; - -struct common_ngram_mod_ext { - common_ngram_mod_ext(uint16_t n, size_t size); - - size_t idx(const int32_t * tokens) const; - - void add(const int32_t * tokens); - int32_t get(const int32_t * tokens, int32_t offs) const; // return -1 if not found - - size_t n; // ngram size to hash - - std::vector entries; -}; - -using common_ngram_mod_ext_ptr = std::unique_ptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index a5c2dc5e577..a1a3b51c134 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -113,6 +113,8 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; + // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens + // TODO: add n_call_begin, n_call_accept size_t drafts_call_count = 0; // number of times this implementation was called. size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. 
@@ -122,6 +124,8 @@ struct common_speculative_state { // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. + // TODO: rename to t_draft_us + // TODO: add t_begin_us, t_accept_us int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds. common_speculative_state(enum common_speculative_type type) : type(type) {} @@ -577,7 +581,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { const size_t n = mod.get_n(); // add new ngrams in chunks - if (i_last + 16*n < cur_len) { + if (i_last + 32 < cur_len) { for (size_t i = i_last; i < cur_len - n; ++i) { mod.add(prompt_tgt.data() + i); } @@ -627,6 +631,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { n_low++; if (n_low >= 3) { LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + mod.reset(); n_low = 0; } @@ -839,6 +844,10 @@ common_speculative * common_speculative_init( LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__, params.ngram_size_n, params.ngram_mod->size(), (float)(params.ngram_mod->size_bytes())/1024/1024); + + if (params.ngram_size_n < 16) { + LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n); + } } configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params)); @@ -969,8 +978,7 @@ llama_tokens common_speculative_draft( if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, - common_speculative_type_to_str(impl.get()->type).c_str(), - prompt_tgt.size(), + common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), impl.get()->drafts_call_count, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats @@ -1016,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } + // TODO: report time for begin() and accept() LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->drafts_call_count, From 518926d80948a0cd23548bdf97c36f639017d596 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 18:16:58 +0200 Subject: [PATCH 7/7] cont : fix --- common/ngram-mod.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index b16f79a0753..76f7257f611 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -7,7 +7,7 @@ common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) { entries.resize(size); - std::fill(entries.begin(), entries.end(), EMPTY); + reset(); } size_t common_ngram_mod::idx(const entry_t * tokens) const { @@ -25,7 +25,7 @@ size_t common_ngram_mod::idx(const entry_t * tokens) const { void common_ngram_mod::add(const entry_t * tokens) { const size_t i = idx(tokens); - if (entries[i] != EMPTY) { + if (entries[i] == EMPTY) { used++; }
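
Note (illustrative sketch, not part of the patch): the drafting path added above reduces to a very small data structure. A fixed-size table is indexed by a hash of the last n tokens, and each slot stores only the single token that most recently followed that n-gram; drafting repeatedly looks up the sliding n-gram window and appends the stored continuation. The standalone program below re-implements that idea so it can be compiled and run on its own. The type and variable names (ngram_mod_sketch, the toy token ids) are made up for the illustration; the mixing constant and the add/get behaviour mirror common_ngram_mod from the patch, but this is a sketch of the technique, not the llama.cpp implementation.

// Standalone sketch: minimal modulo n-gram table in the spirit of common/ngram-mod.h.
#include <cstdint>
#include <cstdio>
#include <vector>

struct ngram_mod_sketch {
    static constexpr int32_t EMPTY = -1;

    size_t n;                      // n-gram length used as the hash key
    std::vector<int32_t> entries;  // entries[hash(ngram)] = token that followed the ngram

    ngram_mod_sketch(size_t n, size_t size) : n(n), entries(size, EMPTY) {}

    size_t idx(const int32_t * tokens) const {
        size_t res = 0;
        for (size_t i = 0; i < n; ++i) {
            res = res*6364136223846793005ULL + tokens[i]; // same mixing constant as the patch
        }
        return res % entries.size();
    }

    // store the continuation token; collisions simply overwrite the slot
    void add(const int32_t * tokens) { entries[idx(tokens)] = tokens[n]; }

    // returns EMPTY if this n-gram (or a colliding one) was never added
    int32_t get(const int32_t * tokens) const { return entries[idx(tokens)]; }
};

int main() {
    // A repetitive "context": the token ids 1 2 3 4 5 repeated several times.
    std::vector<int32_t> ctx;
    for (int r = 0; r < 8; ++r) {
        for (int32_t t = 1; t <= 5; ++t) {
            ctx.push_back(t);
        }
    }

    ngram_mod_sketch mod(/*n =*/ 3, /*size =*/ 1024);

    // Index every (n-gram -> next token) pair in the context, as begin()/draft() do.
    for (size_t i = 0; i + mod.n < ctx.size(); ++i) {
        mod.add(ctx.data() + i);
    }

    // Draft a continuation from the last n-gram by repeatedly querying the table.
    std::vector<int32_t> key(ctx.end() - mod.n, ctx.end());
    std::vector<int32_t> draft;
    for (int i = 0; i < 8; ++i) {
        const int32_t tok = mod.get(key.data());
        if (tok == ngram_mod_sketch::EMPTY) {
            break; // unseen n-gram: stop drafting
        }
        draft.push_back(tok);
        key.erase(key.begin());
        key.push_back(tok); // slide the key window forward by the drafted token
    }

    for (int32_t t : draft) {
        printf("%d ", t); // expected: 1 2 3 4 5 1 2 3 (the pattern continues)
    }
    printf("\n");
}

Because collisions are resolved by overwriting a single slot, the patch keeps the table useful by watching the occupancy fraction in begin() and resetting it past a threshold (0.25 in the later revision), and by resetting after a streak of low-acceptance drafts in accept(), rather than chaining multiple continuations per slot.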