Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
res = "jina-v2-zh"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
Expand Down
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
Expand Down
43 changes: 42 additions & 1 deletion src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
// ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
// whitespace pre-tokenizer
regex_exprs = {
"\\S+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
Expand Down Expand Up @@ -518,7 +525,20 @@ struct llm_tokenizer_bpe_session {

void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

std::string text_normalized;
if (vocab.get_apply_lowercase()) {
for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
}
} else {
text_normalized = text;
}

auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
if (vocab.get_use_byte_encoding()) {
word_collection = unicode_words_byte_encode(word_collection);
}

symbols_final.clear();

Expand Down Expand Up @@ -1591,6 +1611,8 @@ struct llama_vocab::impl {
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
bool apply_lowercase = false; // lowercase normalization
bool use_byte_encoding = true; // GPT-2 byte encoding for BPE vocab

std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
Expand Down Expand Up @@ -2031,6 +2053,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else if (
tokenizer_pre == "jina-v2-zh") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
clean_spaces = true;
add_bos = true;
add_sep = true;
apply_lowercase = true;
use_byte_encoding = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -3130,6 +3160,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
if (!use_byte_encoding) {
return _try_copy(token_text.data(), token_text.size());
}
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}
Expand Down Expand Up @@ -3554,6 +3587,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
return pimpl->treat_whitespace_as_suffix;
}

// Returns the `apply_lowercase` flag stored in the vocab implementation
// (the lowercase-normalization switch).
bool llama_vocab::get_apply_lowercase() const {
    const bool lowercase_enabled = pimpl->apply_lowercase;
    return lowercase_enabled;
}

// Returns the `use_byte_encoding` flag stored in the vocab implementation.
bool llama_vocab::get_use_byte_encoding() const {
    const bool byte_encoding_enabled = pimpl->use_byte_encoding;
    return byte_encoding_enabled;
}

// Returns the `max_token_len` value stored in the vocab implementation.
int llama_vocab::max_token_len() const {
    const int longest = pimpl->max_token_len;
    return longest;
}
Expand Down
3 changes: 3 additions & 0 deletions src/llama-vocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 45,
};

struct LLM_KV;
Expand Down Expand Up @@ -130,6 +131,8 @@ struct llama_vocab {
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
bool get_apply_lowercase () const;
bool get_use_byte_encoding () const;

int max_token_len() const;

Expand Down
37 changes: 19 additions & 18 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
return conv.from_bytes(s);
}

// Byte-encodes every word in `bpe_words`.
//
// For each word: decode it to code points, re-serialize those code points to
// UTF-8, then replace every byte of that string with the string returned by
// unicode_byte_to_utf8() and concatenate the pieces.
//
// Returns one encoded string per input word, in input order.
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
    std::vector<std::string> result;

    for (const std::string & word : bpe_words) {
        // rebuild the word from its code points before touching raw bytes
        std::string rebuilt;
        for (const uint32_t cpt : unicode_cpts_from_utf8(word)) {
            rebuilt += unicode_cpt_to_utf8(cpt);
        }

        // map the rebuilt word byte by byte
        std::string mapped;
        for (size_t i = 0; i < rebuilt.size(); ++i) {
            mapped += unicode_byte_to_utf8(rebuilt[i]);
        }

        result.push_back(mapped);
    }

    return result;
}

// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
Expand Down Expand Up @@ -956,6 +939,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
return false;
}

// Applies the byte-level encoding step to each word in `bpe_words`.
//
// Every word is first round-tripped through its code points
// (unicode_cpts_from_utf8 -> unicode_cpt_to_utf8); each byte of the resulting
// UTF-8 string is then passed to unicode_byte_to_utf8() and the returned
// pieces are appended together to form the encoded word.
//
// The output holds exactly one entry per input word, in the same order.
std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
    std::vector<std::string> encoded_words;

    for (const std::string & word : bpe_words) {
        std::string utf8_bytes;
        for (const uint32_t cpt : unicode_cpts_from_utf8(word)) {
            utf8_bytes += unicode_cpt_to_utf8(cpt);
        }

        std::string encoded;
        for (const char byte : utf8_bytes) {
            encoded += unicode_byte_to_utf8(byte);
        }

        encoded_words.push_back(encoded);
    }

    return encoded_words;
}

std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
Expand Down Expand Up @@ -1143,5 +1144,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
start += offset;
}

return unicode_byte_encoding_process(bpe_words);
return bpe_words;
}
2 changes: 2 additions & 0 deletions src/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);

bool unicode_cpt_is_han(uint32_t cpt);

std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);

std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);