Testing (ggerganov#1)
* Fix tokenization of a Chinese word made of 3 Chinese characters whose first 2 characters do not form a word on their own

* tokenizer-fix

* E5 Pretokenizer bugfix

* whitespace fix

* remove extra wpm

---------

Co-authored-by: Mike Fan <[email protected]>
Co-authored-by: Oliver Ye <[email protected]>
3 people authored Jul 22, 2024
1 parent 741a9a8 commit bec1494
Showing 4 changed files with 67 additions and 175 deletions.
12 changes: 5 additions & 7 deletions convert_hf_to_gguf.py
@@ -458,18 +458,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
@@ -488,6 +482,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
# ref: https://huggingface.co/intfloat/multilingual-e5-base
res = "multilingual-e5-base"
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
@@ -2354,7 +2351,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_pooling_type(pooling_type)

def set_vocab(self):
tokens, toktypes, tokpre = self.get_vocab_base('default')
tokens, toktypes, tokpre = self.get_vocab_base()
self.vocab_size = len(tokens)

self.gguf_writer.add_token_type_count(int(self.hparams['type_vocab_size']))
@@ -2364,6 +2361,7 @@ def set_vocab(self):
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_eos_token(True)

# handle special tokens
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
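
For reference, get_vocab_base_pre() identifies the pre-tokenizer by hashing the token IDs that the Hugging Face tokenizer produces for a fixed probe string, then maps that checksum to a name such as "multilingual-e5-base". Below is a minimal, simplified Python sketch of that lookup, not the converter itself; the helper names and the two-entry checksum table are placeholders chosen for illustration.

from hashlib import sha256

# illustrative subset of known checksum -> pre-tokenizer name pairs,
# including the entry this commit adds for intfloat/multilingual-e5-base
KNOWN_PRE_TOKENIZERS = {
    "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": "jina-v2-code",
    "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327": "multilingual-e5-base",
}

def pre_tokenizer_checksum(tokenizer, probe_text: str) -> str:
    # hash the token ids produced for a fixed probe string: tokenizers that
    # split the probe identically map to the same checksum
    token_ids = tokenizer.encode(probe_text)
    return sha256(str(token_ids).encode()).hexdigest()

def detect_pre_tokenizer(tokenizer, probe_text: str) -> str:
    chkhsh = pre_tokenizer_checksum(tokenizer, probe_text)
    try:
        return KNOWN_PRE_TOKENIZERS[chkhsh]
    except KeyError:
        raise NotImplementedError(
            f"unknown pre-tokenizer checksum {chkhsh}; "
            "re-run convert_hf_to_gguf_update.py to regenerate the table"
        )

The other half of this file's change, add_add_eos_token(True) in set_vocab(), records in the GGUF metadata that an EOS token should be appended at tokenization time, matching the XLM-RoBERTa convention of wrapping inputs as <s> ... </s>.
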
3 changes: 2 additions & 1 deletion convert_hf_to_gguf_update.py
@@ -86,6 +86,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "multilingual-e5-base", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/intfloat/multilingual-e5-base", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
@@ -141,7 +142,7 @@ def download_model(model):
name = model["name"]
tokt = model["tokt"]

if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
if (tokt == TOKENIZER_TYPE.SPM and name != "multilingual-e5-base") or tokt == TOKENIZER_TYPE.UGM:
continue

# Skip if the tokenizer folder does not exist or there are other download issues previously
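
The update script normally skips SPM and UGM tokenizers, which do not need a pre-tokenizer checksum; this change carves out multilingual-e5-base as an exception so its tokenizer is still downloaded and hashed. The same skip rule, restated as a standalone helper for clarity (the name needs_checksum is illustrative, not part of the script):

from enum import IntEnum, auto

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
    UGM = auto()

def needs_checksum(name: str, tokt: TOKENIZER_TYPE) -> bool:
    # SPM/UGM models are skipped by default; multilingual-e5-base is the
    # exception added here so its pre-tokenizer entry can be generated
    if tokt == TOKENIZER_TYPE.UGM:
        return False
    if tokt == TOKENIZER_TYPE.SPM and name != "multilingual-e5-base":
        return False
    return True

assert needs_checksum("multilingual-e5-base", TOKENIZER_TYPE.SPM)
assert not needs_checksum("gemma", TOKENIZER_TYPE.SPM)
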
1 change: 1 addition & 0 deletions include/llama.h
@@ -92,6 +92,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_E5 = 20,
};

// note: these values should be synchronized with ggml_rope
226 changes: 59 additions & 167 deletions src/llama.cpp
@@ -278,7 +278,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_XLMROBERTA, "xlm-roberta" },
{ LLM_ARCH_XLMROBERTA, "xlm-roberta" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -5462,7 +5462,12 @@ static void llm_load_vocab(
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
if (tokenizer_pre == "multilingual-e5-base") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_E5;
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
vocab.tokenizer_add_space_prefix = true;
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_add_bos = true;
@@ -15194,7 +15199,10 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
}
// Try to fall back to just the byte as a string
const char buf2[2] = { (char)ch, 0 };
return vocab.token_to_id.at(buf2);
const auto token = vocab.token_to_id.find(buf2);
// fall back to the unknown token if the byte is not in the vocab, instead of throwing
return token != vocab.token_to_id.end() ? token->second : vocab.special_unk_id;
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {
@@ -15679,34 +15687,59 @@ struct llm_tokenizer_bpe {
};

struct llm_tokenizer_wpm {
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {
is_xlm_vocab = vocab.token_to_id.size() > 100000 &&
vocab.token_to_id.find("数据") != vocab.token_to_id.end();
}
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
if (is_xlm_vocab) {
tokenize_xlm(text, output);
} else {
tokenize_default(text, output);
}
}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
const auto & token_map = vocab.token_to_id;

void tokenize_default(const std::string & text, std::vector<llama_vocab::id> & output) {
// normalize and split by whitespace
std::vector<std::string> words = preprocess_default(text);
std::vector<std::string> words = preprocess(text);

// bos token prepended already

// find the longest tokens that form the words
for (const std::string &word : words) {
if (word.size() > 0) {
tokenize_word_default(word, output);
for (const std::string & word : words) {
// skip empty words
if (word.size() == 0) {
continue;
}

// prepend phantom space
const std::string word1 = "\xe2\x96\x81" + word;
const int n = word1.size();

const size_t current_tokens = output.size();

// we're at the start of a new word
// move through character position in word
for (int i = 0; i < n; ++i) {
// loop through possible match length
bool match = false;
for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
auto it = token_map.find(word1.substr(i, j - i));
if (it != token_map.end()) {
output.push_back(it->second);
match = true;
i = j - 1;
break;
}
}

if (!match) { // discard all
output.resize(current_tokens);
break; // and discard next tokens
}
}

// we didn't find any matches for this word
if (current_tokens == output.size()) {
output.push_back(vocab.special_unk_id);
}
}
}

std::vector<std::string> preprocess_default(const std::string & text) {
// TODO: reduce string copies by using cpts_offs array
std::vector<std::string> preprocess(const std::string & text) const {
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
std::vector<std::string> words(1, "");

@@ -15744,151 +15777,6 @@ struct llm_tokenizer_wpm {
return words;
}

void tokenize_xlm(const std::string & text, std::vector<llama_vocab::id> & output) {
auto cpts_word_2_str = [](const std::vector<uint32_t> & cpts_word) {
std::string word;
for (auto c : cpts_word) {
word += unicode_cpt_to_utf8(c);
}
return word;
};

auto is_english_char = [](uint32_t cpt) {
const auto flags = unicode_cpt_flags(cpt);
return !(cpt == 0 || cpt == 0xFFFD || flags.is_control || flags.is_punctuation ||
(cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt));
};

const auto & token_map = vocab.token_to_id;

// normalize and split by whitespace
auto all_cpts_words = preprocess_xlm(text);

// bos token prepended already

// find the longest tokens that form the words
for (int i = 0; i < (int)all_cpts_words.size(); ++i) {
const auto & cpts_word = all_cpts_words[i];
// skip empty words
if (cpts_word.size() == 0) {
continue;
}

std::string word = cpts_word_2_str(cpts_word);
if (cpts_word.size() != 1 || (cpts_word.size() == 1 && is_english_char(cpts_word[0]))) {
tokenize_word_default(word, output);
continue;
}

auto it = token_map.find(word);
auto token_id = it != token_map.end() ? it->second : vocab.special_unk_id;
if (token_id == vocab.special_unk_id) {
output.push_back(token_id);
continue;
}

auto j = i + 1;
for (; j < (int)all_cpts_words.size(); j++) {
const auto& next_cpts_word = all_cpts_words[j];
if (next_cpts_word.size() != 1 || (next_cpts_word.size() == 1 && is_english_char(next_cpts_word[0]))) {
break;
}

auto next_word = cpts_word_2_str(next_cpts_word);
it = token_map.find(word + next_word);
auto token_id_2 = it != token_map.end() ? it->second : vocab.special_unk_id;;
if (token_id_2 == vocab.special_unk_id) {
break;
}

token_id = token_id_2;
word += next_word;
}

output.push_back(token_id);
i = j - 1;
}
}

std::vector<std::vector<uint32_t>> preprocess_xlm(const std::string & text) {
std::vector<uint32_t> cpts_word;
std::vector<std::vector<uint32_t>> all_cpts_words;
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
for (const uint32_t cpt : cpts_nfd) {
const auto flags = unicode_cpt_flags(cpt);

if (flags.is_whitespace) {
if (!cpts_word.empty()) {
all_cpts_words.emplace_back(cpts_word);
cpts_word.clear();
}
continue;
}

assert (!flags.is_separator);
if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
if (!cpts_word.empty()) {
all_cpts_words.emplace_back(cpts_word);
cpts_word.clear();
}
continue;
}

if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
if (!cpts_word.empty()) {
all_cpts_words.emplace_back(cpts_word);
cpts_word.clear();
}
all_cpts_words.emplace_back(std::vector<uint32_t>{cpt});
}
else {
cpts_word.emplace_back(cpt);
}
}

if (!cpts_word.empty()) {
all_cpts_words.emplace_back(cpts_word);
}

return all_cpts_words;
}

void tokenize_word_default(const std::string & word, std::vector<llama_vocab::id> & output) {
const auto & token_map = vocab.token_to_id;

// prepend phantom space
const std::string word1 = "\xe2\x96\x81" + word;
const int n = word1.size();

const size_t current_tokens = output.size();

// we're at the start of a new word
// move through character position in word
for (int i = 0; i < n; ++i) {
// loop through possible match length
bool match = false;
for (int j = n; j > i; j--) {
auto it = token_map.find(word1.substr(i, j - i));
if (it != token_map.end()) {
output.push_back(it->second);
match = true;
i = j - 1;
break;
}
}

if (!match) { // discard all
output.resize(current_tokens);
break; // and discard next tokens
}
}

// we didn't find any matches for this word
if (current_tokens == output.size()) {
output.push_back(vocab.special_unk_id);
}
}

static bool is_chinese_char(uint32_t cpt) {
return
(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
@@ -15903,8 +15791,7 @@ struct llm_tokenizer_wpm {
//(cpt >= 0xFF00 && cpt <= 0xFFEF);
}

bool is_xlm_vocab;
const llama_vocab & vocab;
const llama_vocab & vocab;
};

struct naive_trie {
@@ -16473,6 +16360,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llm_tokenizer_spm tokenizer(vocab);
// temporary workaround for the SPM pre-tokenizer: for E5, collapse runs of whitespace into a single space
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_E5) {
std::regex ws_re("\\s+");
raw_text = std::regex_replace(raw_text, ws_re, " ");
}
llama_escape_whitespace(raw_text);
tokenizer.tokenize(raw_text, output);
is_prev_special = false;
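
For readers following the tokenizer changes, here is a minimal Python sketch of the two behaviors this commit touches in llama.cpp: the WPM path greedily matches the longest vocabulary entry at each position of a word prefixed with the phantom-space marker, falling back to a single unknown token when a word cannot be covered, and the new E5 pre-tokenizer path collapses whitespace runs before the usual SPM escaping. The names vocab, unk_id and max_token_len below are illustrative parameters, not the actual C++ members, and the sketch works on characters rather than bytes.

import re

PHANTOM_SPACE = "\u2581"  # the marker prepended to each word

def wpm_tokenize_word(word: str, vocab: dict, unk_id: int, max_token_len: int) -> list:
    # greedy longest-match over the vocabulary, mirroring llm_tokenizer_wpm::tokenize
    piece = PHANTOM_SPACE + word
    out = []
    i = 0
    while i < len(piece):
        # try the longest candidate first, bounded by the longest token in the vocab
        for j in range(min(len(piece), i + max_token_len + 1), i, -1):
            if piece[i:j] in vocab:
                out.append(vocab[piece[i:j]])
                i = j
                break
        else:
            # no match at this position: the whole word becomes a single unknown token
            return [unk_id]
    return out

def e5_pre_tokenize(text: str) -> str:
    # the E5 workaround: collapse any run of whitespace into a single space
    return re.sub(r"\s+", " ", text)

# tiny usage example with a made-up vocabulary
toy_vocab = {"\u2581hell": 7, "o": 8, "\u2581hello": 9}
assert wpm_tokenize_word("hello", toy_vocab, unk_id=0, max_token_len=6) == [9]
assert e5_pre_tokenize("a\t b\n\nc") == "a b c"
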
