Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1fb2658
server: introduce self-speculative decoding
srogmann Dec 29, 2025
1faeb62
server: moved self-call into speculative.cpp
srogmann Dec 30, 2025
e3e809c
can_speculate() includes self-speculation
srogmann Jan 1, 2026
38f7c28
server: can_speculate() tests self-spec
srogmann Jan 1, 2026
917f4bb
server: replace can_speculate() with slot.can_speculate()
srogmann Jan 2, 2026
f1f6584
common: use %zu format specifier for size_t in logging
srogmann Jan 3, 2026
907d094
server: can_speculate() requires a task instance
srogmann Jan 3, 2026
456268f
common: ngram map, config self-speculative decoding
srogmann Jan 14, 2026
b38eb59
common: add enum common_speculative_type
srogmann Jan 18, 2026
eb43748
common: add vector of speculative states
srogmann Jan 21, 2026
1e29af4
common: add option --spec-draftless
srogmann Jan 22, 2026
a1584ac
server: cleanup (remove slot.batch_spec, rename)
srogmann Jan 23, 2026
cb3a402
common: moved self-spec impl to ngram-map
srogmann Jan 25, 2026
af382c3
common: cleanup (use common_speculative_state_draft)
srogmann Jan 25, 2026
924517d
spec : refactor
ggerganov Jan 25, 2026
9ac8817
cont : naming
ggerganov Jan 25, 2026
8ea068e
spec: remove --spec-config
srogmann Jan 25, 2026
288ab50
doc: (draftless) speculative decoding
srogmann Jan 25, 2026
fd4d803
common: print performance in spec decoding
srogmann Jan 25, 2026
f895bca
minor : cleanup
ggerganov Jan 26, 2026
a330093
common : better names
ggerganov Jan 26, 2026
1f8d366
minor : cleanup + fix build
ggerganov Jan 26, 2026
72f416e
minor: comments
srogmann Jan 26, 2026
dd23149
CODEOWNERS: add common/ngram-map.* (#18471)
srogmann Jan 26, 2026
351e798
Merge branch 'master' into pr/18471
ggerganov Jan 27, 2026
bc33838
common : rename speculative.draftless_type -> speculative.type
ggerganov Jan 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
/common/jinja/ @ngxson @CISC @aldehir
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/ngram-map.* @srogmann
/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
Expand Down
2 changes: 2 additions & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
ngram-map.cpp
ngram-map.h
peg-parser.cpp
peg-parser.h
preset.cpp
Expand Down
69 changes: 65 additions & 4 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "speculative.h"
#include "preset.h"

// fix problem with std::min and std::max
Expand Down Expand Up @@ -1216,16 +1217,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-lcs", "--lookup-cache-static"}, "FNAME",
"path to static lookup cache to use for lookup decoding (not updated by generation)",
[](common_params & params, const std::string & value) {
params.lookup_cache_static = value;
params.speculative.lookup_cache_static = value;
}
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-lcd", "--lookup-cache-dynamic"}, "FNAME",
"path to dynamic lookup cache to use for lookup decoding (updated by generation)",
[](common_params & params, const std::string & value) {
params.lookup_cache_dynamic = value;
params.speculative.lookup_cache_dynamic = value;
}
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-c", "--ctx-size"}, "N",
string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
Expand Down Expand Up @@ -3396,6 +3397,66 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-draftless"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
common_speculative_type_to_str(params.speculative.type).c_str()),
[](common_params & params, const std::string & value) {
if (value == "none") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
} else if (value == "ngram-cache") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
} else if (value == "ngram-simple") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
} else if (value == "ngram-map-k") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
} else if (value == "ngram-map-k4v") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
} else {
throw std::invalid_argument("unknown speculative decoding type without draft model");
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-size-n"}, "N",
string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
[](common_params & params, int value) {
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_n = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-size-m"}, "N",
string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
[](common_params & params, int value) {
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_m = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-check-rate"}, "N",
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram check rate must be at least 1");
}
params.speculative.ngram_check_rate = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-min-hits"}, "N",
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram min hits must be at least 1");
}
params.speculative.ngram_min_hits = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
Expand Down
9 changes: 4 additions & 5 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1097,7 +1097,10 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.tensor_split,
params.tensor_buft_overrides.data(),
params.fit_params_target.data(),
params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}

Expand Down Expand Up @@ -1208,10 +1211,6 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}

void common_init_result::free_context() {
pimpl->context.reset();
}

common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result_ptr res(new common_init_result(params));

Expand Down
33 changes: 25 additions & 8 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,16 @@ enum common_params_sampling_config : uint64_t {
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
};

enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};

// sampling parameters
struct common_params_sampling {
Expand Down Expand Up @@ -251,6 +261,7 @@ struct common_params_speculative {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)

std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

Expand All @@ -261,6 +272,20 @@ struct common_params_speculative {
struct cpu_params cpuparams_batch;

struct common_params_model model;

common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT

bool has_dft() const {
return !model.path.empty() || !model.hf_repo.empty();
}
};

struct common_params_vocoder {
Expand Down Expand Up @@ -378,8 +403,6 @@ struct common_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT

// llama-debug specific options
Expand Down Expand Up @@ -575,10 +598,6 @@ struct common_params {
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;

bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}
};

// call once at the start of a program if it uses libcommon
Expand Down Expand Up @@ -714,8 +733,6 @@ struct common_init_result {

std::vector<llama_adapter_lora_ptr> & lora();

void free_context();

private:
struct impl;
std::unique_ptr<impl> pimpl;
Expand Down
7 changes: 3 additions & 4 deletions common/ngram-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,12 @@ void common_ngram_cache_draft(
break;
}

LOG(" - draft candidate: token=%d\n", drafted_token);
LOG_DBG(" - draft candidate: token=%d\n", drafted_token);
draft.push_back(drafted_token);
}
}

void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
void common_ngram_cache_save(common_ngram_cache & ngram_cache, const std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
const common_ngram ngram = item.first;
Expand All @@ -217,10 +217,9 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
}
}

}

common_ngram_cache common_ngram_cache_load(std::string & filename) {
common_ngram_cache common_ngram_cache_load(const std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
Expand Down
4 changes: 2 additions & 2 deletions common/ngram-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,12 @@ void common_ngram_cache_draft(
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
void common_ngram_cache_save(common_ngram_cache & ngram_cache, const std::string & filename);

// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
common_ngram_cache common_ngram_cache_load(std::string & filename);
common_ngram_cache common_ngram_cache_load(const std::string & filename);

// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
Expand Down
Loading