Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3437,16 +3437,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.ngram_size_m = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-check-rate"}, "N",
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram check rate must be at least 1");
}
params.speculative.ngram_check_rate = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-min-hits"}, "N",
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
Expand Down
1 change: 0 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ struct common_params_speculative {

uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

std::shared_ptr<common_ngram_mod> ngram_mod;
Expand Down
7 changes: 3 additions & 4 deletions common/ngram-map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,9 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}

// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (map.idx_last_check + map.check_rate > cur_len) {
return;
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
}
map.idx_last_check = cur_len;

Expand Down
8 changes: 3 additions & 5 deletions common/ngram-map.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
struct common_ngram_simple_config {
uint16_t size_ngram; // size of n-grams to lookup in self-mode
uint16_t size_mgram; // size of m-grams to draft in self-mode
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
};

// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
Expand Down Expand Up @@ -66,15 +65,14 @@ struct common_ngram_map {
bool key_only; // true if only key n-grams are used, no values.

std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
uint16_t min_hits; // minimum number of key hits to consider a draft

bool show_key_map_stats = false; // true, if statitics of the key_map should be printed.
bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.

common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t check_rate, uint16_t min_hits)
uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
check_rate(check_rate), min_hits(min_hits) {
min_hits(min_hits) {
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
}

Expand Down
55 changes: 24 additions & 31 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,14 @@ static bool common_speculative_are_compatible(
struct common_speculative_state {
const enum common_speculative_type type;

// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
// TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation.
size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model.
size_t n_call_begin = 0; // number of times this implementation was called for refresh.
size_t n_call_draft = 0; // number of times this implementation was called for generation.
size_t n_call_accept = 0; // number of times this implementation was called for accumulation.

size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation.
size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model.
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.

// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
Expand Down Expand Up @@ -465,8 +466,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
struct common_speculative_state_ngram_simple : public common_speculative_state {
common_ngram_simple_config config;

uint16_t check_id = 0; // used to control the frequency of generating drafts

common_speculative_state_ngram_simple(
enum common_speculative_type type,
common_ngram_simple_config config)
Expand All @@ -481,11 +480,6 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
++check_id;
if (check_id < config.check_rate) {
return;
}
check_id = 0;

result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
Expand Down Expand Up @@ -752,10 +746,9 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c
uint16_t size_key = config.params.ngram_size_n;
uint16_t size_value = config.params.ngram_size_m;
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
uint16_t check_rate = config.params.ngram_check_rate;
uint16_t min_hits = config.params.ngram_min_hits;

return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits);
return common_ngram_map(size_key, size_value, key_only, min_hits);
}

static common_speculative_state_ngram_cache create_state_ngram_cache(
Expand Down Expand Up @@ -895,12 +888,10 @@ common_speculative * common_speculative_init(

uint16_t ngram_size_key = ngram_map.size_key;
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;

auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
/* .size_mgram = */ mgram_size_value
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
Expand Down Expand Up @@ -961,6 +952,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
for (auto & impl : spec->impls) {
common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
impl->begin(prompt);
impl->n_call_begin++;
}
}

Expand All @@ -977,17 +969,17 @@ llama_tokens common_speculative_draft(
{
common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
impl->draft(params, prompt_tgt, id_last, result);
impl->drafts_call_count++;
impl->n_call_draft++;
}

if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->drafts_call_count, result.size());
impl.get()->n_call_draft, result.size());

spec->curr_impl = impl.get(); // set current implementation for stats
impl->drafts_generated_count++;
impl->drafts_generated_tokens += result.size();
impl->n_gen_drafts++;
impl->n_gen_tokens += result.size();

break; // We have a draft, so break out of the loop and return it.
}
Expand All @@ -1008,11 +1000,12 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (n_accepted > 0) {
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
}

impl->accept(n_accepted);
impl->n_call_accept++;
}
}

Expand All @@ -1033,13 +1026,13 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}

LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->drafts_call_count,
impl->drafts_generated_count,
impl->drafts_accepted_count,
impl->drafts_generated_tokens,
impl->drafts_accepted_tokens,
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
str_perf.c_str());
}
}
13 changes: 6 additions & 7 deletions docs/speculative.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding
(default: 1)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
```

Expand Down Expand Up @@ -153,10 +151,6 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.

### `--spec-ngram-check-rate R`

This option aims at performance if the n-gram lookup in history is to costly. A lookup will be executed at every R tokens (default is 1, every token).

### `--spec-ngram-min-hits H`

This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
Expand All @@ -175,7 +169,12 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
```

- `#calls`: number of calls of this implementations
```
statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms
```


- `#calls(b,g,a)`: number of calls of begin (new prompt), generation and accumulation of this implementations
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
Expand Down
4 changes: 0 additions & 4 deletions tools/server/server-task.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ json task_params::to_json(bool only_metrics) const {
{"speculative.type", common_speculative_type_to_str(speculative.type)},
{"speculative.ngram_size_n", speculative.ngram_size_n},
{"speculative.ngram_size_m", speculative.ngram_size_m},
{"speculative.ngram_c_rate", speculative.ngram_check_rate},
{"speculative.ngram_m_hits", speculative.ngram_min_hits},
{"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
Expand Down Expand Up @@ -144,7 +143,6 @@ json task_params::to_json(bool only_metrics) const {
{"speculative.type", common_speculative_type_to_str(speculative.type)},
{"speculative.ngram_size_n", speculative.ngram_size_n},
{"speculative.ngram_size_m", speculative.ngram_size_m},
{"speculative.ngram_c_rate", speculative.ngram_check_rate},
{"speculative.ngram_m_hits", speculative.ngram_min_hits},
{"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
Expand Down Expand Up @@ -257,12 +255,10 @@ task_params server_task::params_from_json_cmpl(

params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m);
params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate);
params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits);

params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024);
params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024);
params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024);
params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024);

// Use OpenAI API logprobs only if n_probs wasn't provided
Expand Down
Loading