auroralabs-loci · loci-dev · Jan 11, 2026
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -1138,6 +1138,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
@@ -106,6 +106,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v2-zh",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
     {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
@@ -461,6 +461,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+                // whitespace pre-tokenizer
+                regex_exprs = {
+                    "\\S+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -518,7 +525,20 @@ struct llm_tokenizer_bpe_session {
 
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        std::string text_normalized;
+        if (vocab.get_apply_lowercase()) {
+            for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
+                text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
+            }
+        } else {
+            text_normalized = text;
+        }
+
+        auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
+        if (vocab.get_use_byte_encoding()) {
+            word_collection = unicode_words_byte_encode(word_collection);
+        }
 
         symbols_final.clear();
 
@@ -1591,6 +1611,8 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces   = false;
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
+    bool apply_lowercase            = false;  // lowercase normalization
+    bool use_byte_encoding          = true;   // GPT-2 byte encoding for BPE vocab
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -2031,6 +2053,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "jina-v2-zh") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
+                clean_spaces = true;
+                add_bos = true;
+                add_sep = true;
+                apply_lowercase = true;
+                use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -3130,6 +3160,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     return _try_copy(token_text.data(), token_text.size());
                 }
                 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    if (!use_byte_encoding) {
+                        return _try_copy(token_text.data(), token_text.size());
+                    }
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
@@ -3554,6 +3587,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }
 
+bool llama_vocab::get_apply_lowercase() const {
+    return pimpl->apply_lowercase;
+}
+
+bool llama_vocab::get_use_byte_encoding() const {
+    return pimpl->use_byte_encoding;
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }

diff --git a/src/llama-vocab.h b/src/llama-vocab.h
@@ -53,6 +53,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN      = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU           = 44,
+    LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH      = 45,
 };
 
 struct LLM_KV;
@@ -130,6 +131,8 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces  () const;
     bool get_escape_whitespaces        () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_apply_lowercase           () const;
+    bool get_use_byte_encoding         () const;
 
     int max_token_len() const;
 

diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     return conv.from_bytes(s);
 }
 
-static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
-    std::vector<std::string> bpe_encoded_words;
-    for (const auto & word : bpe_words) {
-        std::string text_utf;
-        auto utf_word =  unicode_cpts_from_utf8(word);
-        for (size_t i = 0; i < utf_word.size(); ++i) {
-            text_utf += unicode_cpt_to_utf8(utf_word[i]);
-        }
-
-        std::string encoded_token;
-        for (char & c : text_utf) {
-            encoded_token += unicode_byte_to_utf8(c);
-        }
-        bpe_encoded_words.emplace_back(encoded_token);
-    }
-    return bpe_encoded_words;
-}
 
 // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
 static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
@@ -956,6 +939,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }
 
+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word =  unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
@@ -1143,5 +1144,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }
 
-    return unicode_byte_encoding_process(bpe_words);
+    return bpe_words;
 }
diff --git a/src/unicode.h b/src/unicode.h
@@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);
 
 bool unicode_cpt_is_han(uint32_t cpt);
 
+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);

		bool unicode_cpt_is_han(uint32_t cpt);

		std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);

		std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);