Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llama.cpp/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4321,6 +4321,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else if (
tokenizer_pre == "smollm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -12185,6 +12188,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
Expand Down
1 change: 1 addition & 0 deletions llama.cpp/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 14,
};

// note: these values should be synchronized with ggml_rope
Expand Down