Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ struct llm_tokenizer_wpm_session {

void tokenize(const std::string & text, std::vector<llama_token> & output) {
// normalize and split by whitespace
std::vector<std::string> words = preprocess(text);
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_lowercase());
// bos token prepended already

// find the longest tokens that form the words
Expand Down Expand Up @@ -799,7 +799,7 @@ struct llm_tokenizer_wpm_session {
}

// TODO: reduce string copies by using cpts_offs array
static std::vector<std::string> preprocess(const std::string & text) {
static std::vector<std::string> preprocess(const std::string & text, bool lowercase) {
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only lowercase is needed here for now, so I kept this as a single bool.
Should I make this an options struct up front to allow for future BertNormalizer flags like strip_accents, or keep it minimal?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think an options struct makes sense; add a TODO for the other flags.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to do this in this PR, or do you intend to follow up right away?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll open a separate PR to add strip_accents and refactor this into an options struct.

const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
std::vector<std::string> words(1, "");

Expand All @@ -818,7 +818,7 @@ struct llm_tokenizer_wpm_session {
continue;
}

const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
if (words.back().size()) { // finish previous word if any
words.emplace_back();
Expand Down Expand Up @@ -2159,6 +2159,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "whitespace") {
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
normalizer_lowercase = false;
Comment thread
CISC marked this conversation as resolved.
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
Expand Down Expand Up @@ -2339,9 +2340,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
}

const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
Expand Down Expand Up @@ -2511,6 +2511,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
}

// Lowercase normalizer flag (consulted by WPM / whitespace BPE)
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);

// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
Expand Down
Loading