ikawrakow · ikawrakow · Feb 5, 2026 · Jan 20, 2026 · Feb 1, 2026 · Feb 2, 2026
diff --git a/common/common.cpp b/common/common.cpp
@@ -1525,6 +1525,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.antiprompt.emplace_back(argv[i]);
         return true;
     }
+    if (arg == "--banned-string-file") {
+        CHECK_ARG
+        // One banned phrase per line: '"' is stripped, a trailing '\r' (CRLF file) is dropped, empty lines are skipped.
+        std::string files = read_file(std::string(argv[i]));
+        std::vector<std::string> ban_strings = string_split(files, "\n");
+        params.ban_phrases.clear(); // a repeated option replaces the earlier list rather than appending to it
+        for (const auto & str : ban_strings) {
+            std::string phrase; // filtered copy; avoids std::erase(std::string&, char), which only exists since C++20
+            for (const char ch : str) { if (ch != '"') { phrase.push_back(ch); } }
+            if (!phrase.empty() && phrase.back() == '\r') { phrase.pop_back(); }
+            if (!phrase.empty()) { params.ban_phrases.push_back(phrase); }
+        }
+        return true;
+    }
+    if (arg == "--banned-n") {
+        CHECK_ARG
+        params.banned_n = std::stoi(argv[i]); // std::stoi throws on non-numeric input; -1 means all tokens (per the usage text)
+        return true;
+    }
     if (arg == "-ld" || arg == "--logdir") {
         CHECK_ARG
         params.logdir = argv[i];
@@ -2235,6 +2254,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --top-n-sigma t",        "top-n-sigma parmeter (default: %.1f, 0.0 = disabled)", (double)sparams.top_n_sigma});
     options.push_back({ "*",           "       --adaptive-target",      "adaptive-p sampling: (default: %.2f, <0.0 = disabled)", (double)sparams.adaptive_target});
     options.push_back({ "*",           "       --adaptive-decay",       "adaptive-p sampling: (default: %.2f)", (double)sparams.adaptive_decay});
+    options.push_back({ "*",           "       --banned-string-file",   "file path of the list of banned strings on each line" });
+    options.push_back({ "*",           "       --banned-n",             "number of tokens banned in the phrase during rewind. -1 means all tokens: (default: %d)",params.banned_n });
     options.push_back({ "*",           "       --adaptive-updt-w-cur",  "adaptive-p sampling: (default: %s)", sparams.adaptive_updt_w_cur ? "true" : "false"});
     options.push_back({ "*",           "       -l TOKEN_ID(+/-)BIAS",   "modifies the likelihood of token appearing in the completion,\n"
                                                                         "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
@@ -2630,6 +2651,18 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+// ASCII-only lowercase: maps 'A'..'Z' to 'a'..'z' and copies every other byte unchanged (no locale or Unicode case mapping).
+std::string string_lower(const std::string& str) {
+    std::string lowered;
+    lowered.reserve(str.size());
+    for (const char ch : str) {
+        const bool is_ascii_upper = ch >= 'A' && ch <= 'Z';
+        lowered.push_back(is_ascii_upper ? static_cast<char>(ch + ('a' - 'A')) : ch);
+    }
+    return lowered;
+}
+
+
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
     if (search.empty()) {
         return; // Avoid infinite loop if 'search' is an empty string

diff --git a/common/common.h b/common/common.h
@@ -144,40 +144,41 @@ struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1;  // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
-    int32_t n_predict             =    -1;  // new tokens to predict
-    int32_t n_ctx                 =     0;  // context size
-    int32_t n_ctx_draft           =     0;  // context size for draft model
-    int32_t n_batch               =  2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch              =   512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                =     0;  // number of tokens to keep from initial prompt
-    int32_t n_draft               =    16;  // number of tokens to draft during speculative decoding
-    int32_t n_draft_min           =     1;  // minimum number of tokens to draft during speculative decoding
-    float   p_draft_min           =  0.8f;  // minimum speculative decoding probability (greedy)
-    int32_t n_chunks              =    -1;  // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel            =     1;  // number of parallel sequences to decode
-    int32_t n_sequences           =     1;  // number of sequences to decode
-    float   p_split               =  0.1f;  // speculative decoding split probability
-    int32_t n_gpu_layers          =    -1;  // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    =    -1;  // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu              =     0;  // the GPU that is used for scratch and small tensors
-    int32_t max_gpu               =     0;  // max number of GPUs to use at a time for split mode "graph"
-    float   tensor_split[128]     =   {0};  // how split tensors should be distributed across GPUs
-    int32_t grp_attn_n            =     1;  // group-attention factor
-    int32_t grp_attn_w            =   512;  // group-attention width
-    int32_t n_print               =    -1;  // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base        =  0.0f;  // RoPE base frequency
-    float   rope_freq_scale       =  0.0f;  // RoPE frequency scaling factor
-    float   yarn_ext_factor       = -1.0f;  // YaRN extrapolation mix factor
-    float   yarn_attn_factor      =  -1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast        = -1.0f;  // YaRN low correction dim
-    float   yarn_beta_slow        =  -1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx         =     0;  // YaRN original context length
-    float   defrag_thold          = -1.0f;  // KV cache defragmentation threshold
-    int32_t max_extra_alloc_MiB   = 256;    // extra VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
-    int32_t nrep                  = 1;      // number of repetitions used in sweep bench
+    int32_t n_threads_draft       =      -1;
+    int32_t n_threads_batch       =      -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft =      -1;
+    int32_t n_predict             =      -1; // new tokens to predict
+    int32_t n_ctx                 =       0; // context size
+    int32_t n_ctx_draft           =       0; // context size for draft model
+    int32_t n_batch               =    2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              =     512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                =       0; // number of tokens to keep from initial prompt
+    int32_t n_draft               =      16; // number of tokens to draft during speculative decoding
+    int32_t n_draft_min           =       1; // minimum number of tokens to draft during speculative decoding
+    float   p_draft_min           =    0.8f; // minimum speculative decoding probability (greedy)
+    int32_t n_chunks              =      -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            =       1; // number of parallel sequences to decode
+    int32_t n_sequences           =       1; // number of sequences to decode
+    float   p_split               =    0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers          =      -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    =      -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu              =       0; // the GPU that is used for scratch and small tensors
+    int32_t max_gpu               =       0; // max number of GPUs to use at a time for split mode "graph"
+    float   tensor_split[128]     =     {0}; // how split tensors should be distributed across GPUs
+    int32_t grp_attn_n            =       1; // group-attention factor
+    int32_t grp_attn_w            =     512; // group-attention width
+    int32_t n_print               =      -1; // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        =    0.0f; // RoPE base frequency
+    float   rope_freq_scale       =    0.0f; // RoPE frequency scaling factor
+    float   yarn_ext_factor       =   -1.0f; // YaRN extrapolation mix factor
+    float   yarn_attn_factor      =   -1.0f; // YaRN magnitude scaling factor
+    float   yarn_beta_fast        =   -1.0f; // YaRN low correction dim
+    float   yarn_beta_slow        =   -1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx         =       0; // YaRN original context length
+    float   defrag_thold          =   -1.0f; // KV cache defragmentation threshold
+    float   ban_phrases_bias      = -999.0f; // logit bias applied to ban phrases
+    int32_t max_extra_alloc_MiB   =     256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
+    int32_t nrep                  =       1; // number of repetitions used in sweep bench
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
@@ -213,8 +214,12 @@ struct gpt_params {
 
     std::string cuda_params          = ""; // comma separated list of cuda parameters key=value1,key2=value2
 
-    std::vector<std::string> in_files;   // all input files
-    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
+    std::vector<std::string> in_files;     // all input files
+    std::vector<std::string> antiprompt;   // strings upon which more user input is prompted (a.k.a. reverse prompts)
+    std::vector<std::string> ban_phrases;  // strings that are banned in generation
+    int32_t banned_n                 =  1; // number of tokens that are banned in the phrase
+    size_t n_buffer                  =  0; // number of token buffers for string ban
+
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     std::vector<std::pair<int,int>> offload_policy;
@@ -431,6 +436,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string&
 std::string string_join(const std::vector<std::string>& values, const std::string& separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
+std::string string_lower(const std::string & str);
 
 static bool string_starts_with(const std::string& str,
     const std::string& prefix) {  // While we wait for C++20's std::string::starts_with...

diff --git a/examples/server/server-common.h b/examples/server/server-common.h
@@ -13,6 +13,7 @@
 #include <string>
 #include <vector>
 #include <cinttypes>
+#include <deque>
 
 
 
@@ -225,6 +226,8 @@ struct completion_token_output {
     static json probs_vector_to_json(const std::vector<completion_token_output>& probs, bool post_sampling_probs);
 };
 
+using completion_token_outputs = std::deque<completion_token_output>; // deque of per-token outputs; NOTE(review): presumably a deque so entries can be dropped from the front — confirm
+
 // convert a vector of completion_token_output to json
 json probs_vector_to_json(const llama_context* ctx, const std::vector<completion_token_output>& probs);