Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.prompt = argv[i];
std::string aux = "for idx, x in enumerate(xs):\n print(idx, x)";
return true;
}
if (arg == "-e" || arg == "--escape") {
Expand Down
14 changes: 11 additions & 3 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class TOKENIZER_TYPE(IntEnum):

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL # Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)'

if len(sys.argv) == 2:
token = sys.argv[1]
Expand Down Expand Up @@ -144,8 +144,16 @@ def download_model(model):
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
try:
if tokenizer.is_fast:
chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
else:
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
except:
chktok = 'ha'
chkhsh = 'hi'

logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
Expand Down
36 changes: 19 additions & 17 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer

chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL # Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)'

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
Expand All @@ -426,37 +426,37 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
res = "deepseek-llm"
if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
if chkhsh == "b5971447e22eeab46a99f9214146f5aa7a0b976a1ba44ba920330b08970b538f":
# ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
res = "deepseek-coder"
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
if chkhsh == "6f9b95824a16270c180000affa9a33c7cf0a2d257c255a2a781bdf74ccedd296":
# ref: https://huggingface.co/tiiuae/falcon-7b
res = "falcon"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
if chkhsh == "a81435dcfe35da4270630047e3b298a5711a9ab255030065df578e247667b049":
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
res = "bert-bge"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/mosaicml/mpt-7b
res = "mpt"
if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
# ref: https://huggingface.co/bigcode/starcoder2-3b
res = "starcoder"
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
if chkhsh == "e97d3ead2ca033c7fe1c334ee51ef1d7900dafecc6f714775e2b42bff5fac995":
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
res = "stablelm2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
if chkhsh == "e97d3ead2ca033c7fe1c334ee51ef1d7900dafecc6f714775e2b42bff5fac995":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
Expand All @@ -465,19 +465,21 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
res = "jina-v2-es"
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
if chkhsh == "5bbd5b1b041c7eaa7792a5d901950e426fc12d7d84ce2ebfe3c498edd96e517f":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"

if chkhsh == "0ba81c154fbe31d2030588ac40629fbe0ce7dc851252bfc04199dfa6e09a68e8":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
res = "jina-v2-zh"
if res is None:
logger.warning("\n")
logger.warning("**************************************************************************************")
Expand Down
34 changes: 26 additions & 8 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4705,6 +4705,8 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else if (tokenizer_pre == "jina-v2-zh") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -4736,7 +4738,7 @@ static void llm_load_vocab(

for (uint32_t i = 0; i < n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

vocab.token_to_id[word] = i;

Expand Down Expand Up @@ -4772,8 +4774,13 @@ static void llm_load_vocab(
vocab.linefeed_id = vocab.special_pad_id;
} else {
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
vocab.linefeed_id = ids[0];
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
if (ids.empty()) {
vocab.linefeed_id = vocab.special_pad_id;
} else {
vocab.linefeed_id = ids[0];
}

}

// special tokens
Expand Down Expand Up @@ -13014,9 +13021,10 @@ struct llm_tokenizer_bpe {
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
word_collection = unicode_regex_split(text, {
/*word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
});*/
word_collection = unicode_regex_split(text, {"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",});
break;
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
Expand All @@ -13026,6 +13034,15 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
{
std::string data(text.c_str());
std::transform(data.begin(), data.end(), data.begin(),[](unsigned char c){ return std::tolower(c); });
word_collection = unicode_regex_split(data, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
}
break;
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {
Expand Down Expand Up @@ -13132,10 +13149,11 @@ struct llm_tokenizer_bpe {
for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) {
throw std::runtime_error("ERROR: byte not found in vocab");
if (token_multibyte != vocab.token_to_id.end()) {
output.push_back((*token_multibyte).second);
//throw std::runtime_error("ERROR: byte not found in vocab");
}
output.push_back((*token_multibyte).second);

}
} else {
output.push_back((*token).second);
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 15,
};

// note: these values should be synchronized with ggml_rope
Expand Down