Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
ngram-cache.h
ngram-map.cpp
ngram-map.h
ngram-mod.cpp
ngram-mod.h
peg-parser.cpp
peg-parser.h
preset.cpp
Expand Down
4 changes: 3 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
common_speculative_type_to_str(params.speculative.type).c_str()),
[](common_params & params, const std::string & value) {
Expand All @@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
} else if (value == "ngram-map-k4v") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
} else if (value == "ngram-mod") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
} else {
throw std::invalid_argument("unknown speculative decoding type without draft model");
}
Expand Down
5 changes: 5 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
COMMON_SPECULATIVE_TYPE_NGRAM_MOD, // self-speculative decoding with basic n-gram hash table
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
Expand Down Expand Up @@ -252,6 +253,8 @@ struct common_params_model {
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};

struct common_ngram_mod;

struct common_params_speculative {
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

Expand All @@ -269,6 +272,8 @@ struct common_params_speculative {
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

std::shared_ptr<common_ngram_mod> ngram_mod;

std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

Expand Down
35 changes: 15 additions & 20 deletions common/ngram-map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@
#include <cstdio>
#include <sstream>

// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}


// n-gram simple
//

Expand Down Expand Up @@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
// maximum number of counted values of a ngram map value.
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380

static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);

void common_ngram_map_draft(common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
llama_tokens & draft) {
Expand Down Expand Up @@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}

// Helper functions.
//

// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}

1 change: 1 addition & 0 deletions common/ngram-map.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
//

#include "llama.h"
#include "common.h"

#include <vector>

Expand Down
60 changes: 60 additions & 0 deletions common/ngram-mod.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "ngram-mod.h"

//
// common_ngram_mod
//

// Build a hash table with `size` slots for n-gram keys of length `n`.
// All slots start out EMPTY and the fill counter starts at zero.
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
    // allocate and empty-initialize every slot in one step
    entries.assign(size, EMPTY);
}

// Map the n tokens at `tokens` to a slot index in [0, entries.size()).
size_t common_ngram_mod::idx(const entry_t * tokens) const {
    // multiplicative hash over the n tokens; the multiplier is the
    // well-known MMIX LCG constant, reduced modulo the table size
    size_t h = 0;

    for (size_t k = 0; k < n; ++k) {
        h = h*6364136223846793005ULL + tokens[k];
    }

    return h % entries.size();
}

// Record that the n-gram tokens[0..n-1] was followed by tokens[n].
void common_ngram_mod::add(const entry_t * tokens) {
    const size_t slot = idx(tokens);

    // slot transitions from empty to occupied -> bump the fill counter
    if (entries[slot] == EMPTY) {
        ++used;
    }

    // on hash collision the previous continuation is simply overwritten
    entries[slot] = tokens[n];
}

// Look up the continuation stored for the n-gram at `tokens`.
// Returns EMPTY (-1) when the hashed slot was never written.
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
    return entries[idx(tokens)];
}

// Mark every slot empty and reset the fill counter.
void common_ngram_mod::reset() {
    // use vector::assign rather than std::fill: this TU does not include
    // <algorithm>, so std::fill compiled only via transitive includes
    entries.assign(entries.size(), EMPTY);

    used = 0;
}

size_t common_ngram_mod::get_n() const {
return n;
}

size_t common_ngram_mod::get_used() const {
return used;
}

size_t common_ngram_mod::size() const {
return entries.size();
}

size_t common_ngram_mod::size_bytes() const {
return entries.size() * sizeof(entries[0]);
}
37 changes: 37 additions & 0 deletions common/ngram-mod.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

//
// common_ngram_mod
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
//

// basic n-gram hasher
struct common_ngram_mod {
using entry_t = int32_t;

static constexpr entry_t EMPTY = -1;

common_ngram_mod(uint16_t n, size_t size);

size_t idx(const entry_t * tokens) const;
void add(const entry_t * tokens);
entry_t get(const entry_t * tokens) const; // return -1 if not found

void reset();

size_t get_n() const;
size_t get_used() const;

size_t size() const;
size_t size_bytes() const;

private:
size_t n; // ngram size to hash

size_t used;

std::vector<entry_t> entries;
};
Loading
Loading