diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp index ec58864795..7e92fa9830 100644 --- a/llama.cpp/llama.cpp +++ b/llama.cpp/llama.cpp @@ -4321,6 +4321,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "dbrx") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; + } else if ( + tokenizer_pre == "smollm") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -12185,6 +12188,7 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_PRE_TYPE_STARCODER: case LLAMA_VOCAB_PRE_TYPE_REFACT: case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + case LLAMA_VOCAB_PRE_TYPE_SMOLLM: word_collection = unicode_regex_split(text, { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/llama.cpp/llama.h b/llama.cpp/llama.h index 5927f06cdb..78b4890cd0 100644 --- a/llama.cpp/llama.h +++ b/llama.cpp/llama.h @@ -87,6 +87,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, LLAMA_VOCAB_PRE_TYPE_OLMO = 12, LLAMA_VOCAB_PRE_TYPE_DBRX = 13, + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 14, }; // note: these values should be synchronized with ggml_rope