diff --git a/common/common.cpp b/common/common.cpp
index c8df9a4ce8ef5..6d205145ff76d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -344,6 +344,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             return true;
         }
         params.prompt = argv[i];
+        std::string aux = "for idx, x in enumerate(xs):\n print(idx, x)";
         return true;
     }
     if (arg == "-e" || arg == "--escape") {
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index ee1eeaca007b1..bb0dff6907e4c 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -48,7 +48,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL # Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -144,8 +144,16 @@ def download_model(model):
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded
 
-    chktok = tokenizer.encode(chktxt)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
+    try:
+        if tokenizer.is_fast:
+            chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(chktxt)
+            chkhsh = sha256(str(chktok).encode()).hexdigest()
+        else:
+            chktok = tokenizer.encode(chktxt)
+            chkhsh = sha256(str(chktok).encode()).hexdigest()
+    except:
+        chktok = 'ha'
+        chkhsh = 'hi'
 
     logger.info(f"model: {name}")
     logger.info(f"tokt: {tokt}")
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 6632bd95a23f4..e9742c3af099d 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -407,7 +407,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer
 
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL # Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)'
 
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -426,37 +426,37 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
-        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+        if chkhsh == "b5971447e22eeab46a99f9214146f5aa7a0b976a1ba44ba920330b08970b538f":
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
             res = "deepseek-coder"
-        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+        if chkhsh == "6f9b95824a16270c180000affa9a33c7cf0a2d257c255a2a781bdf74ccedd296":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+        if chkhsh == "a81435dcfe35da4270630047e3b298a5711a9ab255030065df578e247667b049":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
-        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+        if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
             # ref: https://huggingface.co/bigcode/starcoder2-3b
             res = "starcoder"
-        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
-        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
+        if chkhsh == "e97d3ead2ca033c7fe1c334ee51ef1d7900dafecc6f714775e2b42bff5fac995":
             # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
             res = "stablelm2"
-        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+        if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+        if chkhsh == "123d32ec0d221232846c22dad13f1a322e7319eab37ab2d0ea22754d652060a7":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+        if chkhsh == "e97d3ead2ca033c7fe1c334ee51ef1d7900dafecc6f714775e2b42bff5fac995":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
@@ -465,19 +465,21 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
-        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
             res = "jina-v2-es"
-        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
-        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+        if chkhsh == "5bbd5b1b041c7eaa7792a5d901950e426fc12d7d84ce2ebfe3c498edd96e517f":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
-        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+        if chkhsh == "9732afa2de833a78655aab9a74bf5b0d332a3b023cc9938beeae4ca9e674523b":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-
+        if chkhsh == "0ba81c154fbe31d2030588ac40629fbe0ce7dc851252bfc04199dfa6e09a68e8":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if res is None:
             logger.warning("\n")
             logger.warning("**************************************************************************************")
diff --git a/llama.cpp b/llama.cpp
index 6def11fee4296..dc0ed11b53542 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4705,6 +4705,8 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "smaug-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (tokenizer_pre == "jina-v2-zh") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -4736,7 +4738,7 @@ static void llm_load_vocab(
 
         for (uint32_t i = 0; i < n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
-            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+            //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
             vocab.token_to_id[word] = i;
 
@@ -4772,8 +4774,13 @@ static void llm_load_vocab(
             vocab.linefeed_id = vocab.special_pad_id;
         } else {
             const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-            GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-            vocab.linefeed_id = ids[0];
+            //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+            if (ids.empty()) {
+                vocab.linefeed_id = vocab.special_pad_id;
+            } else {
+                vocab.linefeed_id = ids[0];
+            }
+
         }
 
         // special tokens
@@ -13014,9 +13021,10 @@ struct llm_tokenizer_bpe {
                     break;
                 case LLAMA_VOCAB_PRE_TYPE_GPT2:
                 case LLAMA_VOCAB_PRE_TYPE_OLMO:
-                    word_collection = unicode_regex_split(text, {
+                    /*word_collection = unicode_regex_split(text, {
                         "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    });
+                    });*/
+                    word_collection = unicode_regex_split(text, {"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",});
                     break;
                 case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                 case LLAMA_VOCAB_PRE_TYPE_QWEN2:
@@ -13026,6 +13034,15 @@ struct llm_tokenizer_bpe {
                         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                     });
                     break;
+                case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                    {
+                        std::string data(text.c_str());
+                        std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c){ return std::tolower(c); });
+                        word_collection = unicode_regex_split(data, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                    }
+                    break;
                 default:
                     // default regex for BPE tokenization pre-processing
                     word_collection = unicode_regex_split(text, {
@@ -13132,10 +13149,11 @@ struct llm_tokenizer_bpe {
                     for (auto j = str.begin(); j != str.end(); ++j) {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
-                        if (token_multibyte == vocab.token_to_id.end()) {
-                            throw std::runtime_error("ERROR: byte not found in vocab");
+                        if (token_multibyte != vocab.token_to_id.end()) {
+                            output.push_back((*token_multibyte).second);
+                            //throw std::runtime_error("ERROR: byte not found in vocab");
                         }
-                        output.push_back((*token_multibyte).second);
+
                     }
                 } else {
                     output.push_back((*token).second);
diff --git a/llama.h b/llama.h
index 9dcd67bef5036..93ec94a671f7c 100644
--- a/llama.h
+++ b/llama.h
@@ -86,6 +86,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_OLMO        = 12,
        LLAMA_VOCAB_PRE_TYPE_DBRX        = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG       = 14,
+        LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH  = 15,
     };
 
     // note: these values should be synchronized with ggml_rope
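
Note (not part of the patch): the convert-hf-to-gguf-update.py hunk switches the checksum input for fast tokenizers from the encoded ids to the raw pre-tokenizer output; that change, together with the extended chktxt, is why the chkhsh constants above are updated. A minimal sketch of the new hashing path, assuming the transformers package is installed and using a shortened stand-in for the real chktxt:

    from hashlib import sha256
    from transformers import AutoTokenizer

    # shortened stand-in for the much longer chktxt used by the script
    chktxt = "We'Ve a'lL # Use the built-in enumerator\nfor idx, x in enumerate(xs):\n print(idx, x)"

    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-zh")
    if tokenizer.is_fast:
        # list of (piece, (start, end)) pairs produced by the HF pre-tokenizer
        chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(chktxt)
    else:
        chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)

The bare except in the patch (falling back to the placeholder values 'ha'/'hi') also absorbs the case where backend_tokenizer.pre_tokenizer is None, which some fast tokenizers have.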
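Note (not part of the patch): the new LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH branch lowercases the input and then applies the GPT-2 pre-tokenizer regex. A rough Python equivalent, using the third-party regex module for \p{L}/\p{N} support (the function name below is illustrative, not llama.cpp API):

    import regex

    # GPT-2 style pre-tokenizer pattern, as used by the JINA_V2_ZH branch
    PATTERN = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"

    def jina_v2_zh_pre_tokenize(text: str) -> list:
        # Python's str.lower() folds all of Unicode, whereas the C++ patch
        # applies std::tolower byte-by-byte, which only folds ASCII letters
        return regex.findall(PATTERN, text.lower())

    print(jina_v2_zh_pre_tokenize("We'Ve a'lL 我想在apple工作"))
    # ['we', "'ve", ' a', "'ll", ' 我想在apple工作']

Because CJK characters match \p{L} just like Latin letters, mixed runs such as "我想在apple工作" stay in a single piece under this pattern.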