Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1525,6 +1525,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.antiprompt.emplace_back(argv[i]);
return true;
}
if (arg == "--banned-string-file") {
    CHECK_ARG
    // The file lists one banned phrase per line. Double quotes are removed so a
    // phrase may be written quoted or bare; lines that end up empty are skipped.
    std::string contents = read_file(std::string(argv[i]));
    std::vector<std::string> lines = string_split(contents, "\n");

    // A repeated flag replaces the earlier list rather than extending it.
    params.ban_phrases.clear();
    for (const std::string & line : lines) {
        // Files with CRLF line endings leave a '\r' at the end of every line after
        // splitting on '\n'; drop it so it does not become part of the phrase.
        std::string::size_type len = line.size();
        if (len > 0 && line[len - 1] == '\r') {
            --len;
        }

        // Filter by hand instead of using C++20 std::erase so this also builds
        // as C++17.
        std::string phrase;
        phrase.reserve(len);
        for (std::string::size_type k = 0; k < len; ++k) {
            if (line[k] != '"') {
                phrase.push_back(line[k]);
            }
        }

        if (!phrase.empty()) {
            params.ban_phrases.push_back(phrase);
        }
    }
    return true;
}
// --banned-n N: number of tokens of a banned phrase that are banned during
// rewind; per the usage text, -1 means all tokens.
if (arg == "--banned-n") {
// CHECK_ARG is a macro defined outside this view; presumably it verifies that a
// value follows the flag and advances i to it -- confirm at its definition.
CHECK_ARG
// std::stoi throws std::invalid_argument / std::out_of_range on malformed input.
// NOTE(review): confirm the caller of gpt_params_find_arg handles these.
params.banned_n = std::stoi(argv[i]);
return true;
}
if (arg == "-ld" || arg == "--logdir") {
CHECK_ARG
params.logdir = argv[i];
Expand Down Expand Up @@ -2235,6 +2254,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --top-n-sigma t", "top-n-sigma parmeter (default: %.1f, 0.0 = disabled)", (double)sparams.top_n_sigma});
options.push_back({ "*", " --adaptive-target", "adaptive-p sampling: (default: %.2f, <0.0 = disabled)", (double)sparams.adaptive_target});
options.push_back({ "*", " --adaptive-decay", "adaptive-p sampling: (default: %.2f)", (double)sparams.adaptive_decay});
// Usage entries for the banned-string options parsed in gpt_params_find_arg.
options.push_back({ "*", " --banned-string-file", "file path of the list of banned strings on each line" });
options.push_back({ "*", " --banned-n", "number of tokens banned in the phrase during rewind. -1 means all tokens: (default: %d)",params.banned_n });
options.push_back({ "*", " --adaptive-updt-w-cur", "adaptive-p sampling: (default: %s)", sparams.adaptive_updt_w_cur ? "true" : "false"});
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
Expand Down Expand Up @@ -2630,6 +2651,18 @@ std::string string_get_sortable_timestamp() {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}

// Returns a copy of `str` in which the ASCII letters 'A'..'Z' are replaced by
// their lowercase counterparts. Every other byte (digits, punctuation, non-ASCII
// bytes) is copied through unchanged, so this could be improved to support more
// languages.
std::string string_lower(const std::string& str) {
    std::string lowered;
    lowered.reserve(str.size());
    for (const char ch : str) {
        const bool is_ascii_upper = ch >= 'A' && ch <= 'Z';
        lowered.push_back(is_ascii_upper ? static_cast<char>(ch + ('a' - 'A')) : ch);
    }
    return lowered;
}


void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string
Expand Down
78 changes: 42 additions & 36 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,40 +144,41 @@ struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx_draft = 0; // context size for draft model
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft_min = 1; // minimum number of tokens to draft during speculative decoding
float p_draft_min = 0.8f; // minimum speculative decoding probability (greedy)
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
int32_t max_extra_alloc_MiB = 256; // extra VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
int32_t nrep = 1; // number of repetitions used in sweep bench
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx_draft = 0; // context size for draft model
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft_min = 1; // minimum number of tokens to draft during speculative decoding
float p_draft_min = 0.8f; // minimum speculative decoding probability (greedy)
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
float ban_phrases_bias = -999.0f; // logit bias applied to ban phrases
int32_t max_extra_alloc_MiB = 256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
int32_t nrep = 1; // number of repetitions used in sweep bench

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
Expand Down Expand Up @@ -213,8 +214,12 @@ struct gpt_params {

std::string cuda_params = ""; // comma separated list of cuda parameters key=value1,key2=value2

std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<std::string> ban_phrases; // strings that are banned in generation
int32_t banned_n = 1; // number of tokens that are banned in the phrase
size_t n_buffer = 0; // number of token buffers for string ban

std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
std::vector<std::pair<int,int>> offload_policy;
Expand Down Expand Up @@ -431,6 +436,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string&
std::string string_join(const std::vector<std::string>& values, const std::string& separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
// Returns a copy of str with ASCII 'A'-'Z' converted to lowercase; all other bytes are left unchanged.
std::string string_lower(const std::string & str);

static bool string_starts_with(const std::string& str,
const std::string& prefix) { // While we wait for C++20's std::string::starts_with...
Expand Down
3 changes: 3 additions & 0 deletions examples/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <string>
#include <vector>
#include <cinttypes>
#include <deque>



Expand Down Expand Up @@ -225,6 +226,8 @@ struct completion_token_output {
static json probs_vector_to_json(const std::vector<completion_token_output>& probs, bool post_sampling_probs);
};

// Double-ended queue of per-token completion outputs.
// NOTE(review): presumably a deque (rather than a vector) so buffered tokens can be
// removed from the front cheaply -- confirm against the code that uses this alias.
using completion_token_outputs = std::deque<completion_token_output>;

// convert a vector of completion_token_output to json
json probs_vector_to_json(const llama_context* ctx, const std::vector<completion_token_output>& probs);

Expand Down
Loading