-
Notifications
You must be signed in to change notification settings - Fork 19.2k
vocab: add tokenizer support for jina-embeddings-v2-base-zh #18756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { | |
| "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", | ||
| }; | ||
| break; | ||
| case LLAMA_VOCAB_PRE_TYPE_WHITESPACE: | ||
| // whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh) | ||
| regex_exprs = { | ||
| "\\S+", | ||
| }; | ||
| byte_encode = false; | ||
| break; | ||
| default: | ||
| // default regex for BPE tokenization pre-processing | ||
| regex_exprs = { | ||
|
|
@@ -1671,6 +1678,35 @@ struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { | |
| const llama_vocab & vocab; | ||
| }; | ||
|
|
||
| /* BPE session variant that splits the input on Unicode whitespace and passes each non-empty run of non-whitespace codepoints to the base BPE session. */ struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session { | ||
| /* Forwards vocab and tokenizer to the base session and keeps its own reference to vocab, which tokenize() reads. */ llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} | ||
|
|
||
| /* Tokenizes text segment by segment through the base-class tokenize(), passing output along; whitespace codepoints only separate segments. */ void tokenize(const std::string & text, std::vector<llama_token> & output) override { | ||
| const bool lowercase = vocab.get_normalizer_lowercase(); /* when set, every kept codepoint goes through unicode_tolower() */ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be checked in WPM as well (for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened #23899 for this (verified with |
||
|
|
||
| std::string segment; /* UTF-8 text collected since the last whitespace codepoint */ | ||
| /* flush: hands the pending segment to the base-class tokenize() and clears it; does nothing when the segment is empty */ auto flush = [&]() { | ||
| if (!segment.empty()) { | ||
| llm_tokenizer_bpe_session::tokenize(segment, output); | ||
| segment.clear(); | ||
| } | ||
| }; | ||
|
|
||
| for (uint32_t cpt : unicode_cpts_from_utf8(text)) { | ||
| // drop whitespace: it only ends the current segment and is never appended to it | ||
| if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) { | ||
| flush(); | ||
| } else { | ||
| segment += unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); | ||
| } | ||
| } | ||
| flush(); /* pass on the trailing segment, if any */ | ||
| } | ||
|
|
||
| private: | ||
| const llama_vocab & vocab; | ||
| }; | ||
|
|
||
| // | ||
| // impl | ||
| // | ||
|
|
@@ -1751,6 +1787,7 @@ struct llama_vocab::impl { | |
| bool remove_extra_whitespaces = false; | ||
| bool escape_whitespaces = true; | ||
| bool treat_whitespace_as_suffix = false; | ||
| bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json) | ||
|
|
||
| std::unordered_map<std::string, llama_token> token_to_id; | ||
| std::vector<token_data> id_to_token; | ||
|
|
@@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { | |
| special_mask_id = 103; | ||
|
|
||
| add_sep = true; | ||
| } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") { | ||
| } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") { | ||
| type = LLAMA_VOCAB_TYPE_BPE; | ||
|
|
||
| // read bpe merges and populate bpe ranks | ||
|
|
@@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { | |
| tokenizer_pre == "roberta-bpe") { | ||
| pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; | ||
| add_sep = true; | ||
| } else if ( | ||
| tokenizer_pre == "whitespace") { | ||
| pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; | ||
| } else if ( | ||
| tokenizer_pre == "refact") { | ||
| pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; | ||
|
|
@@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { | |
| pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; | ||
| } | ||
|
|
||
| ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); | ||
| ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); | ||
| ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); | ||
| ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); | ||
| ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); | ||
| } | ||
|
|
||
| const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); | ||
|
|
@@ -3264,6 +3305,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize( | |
| std::unique_ptr<llm_tokenizer_bpe_session> session; | ||
| if (vocab.get_tokenizer_model() == "hybriddna") { | ||
| session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe); | ||
| } else if (vocab.get_tokenizer_model() == "whitespace") { | ||
| session = std::make_unique<llm_tokenizer_whitespace_session>(vocab, *tok_bpe); | ||
| } else { | ||
| session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe); | ||
| } | ||
|
|
@@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const { | |
| return pimpl->treat_whitespace_as_suffix; | ||
| } | ||
|
|
||
| /* Returns the normalizer_lowercase flag held by the vocab implementation (pimpl). */ bool llama_vocab::get_normalizer_lowercase() const { | ||
| return pimpl->normalizer_lowercase; | ||
| } | ||
|
|
||
| /* Returns the max_token_len value held by the vocab implementation (pimpl). */ int llama_vocab::max_token_len() const { | ||
| return pimpl->max_token_len; | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The obvious flaw here is that someone might try to use `Lowercase` with another tokenizer, or leave it out with `Whitespace`, but let's worry about that then, as this complicates things for something that probably will never happen (famous last words).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, tokenizer variants are endless...