From 0c1c9d33a805aad92c28cc15213a0dc5867f8818 Mon Sep 17 00:00:00 2001
From: o7si <o7si@kanda-mashiro.cc>
Date: Sat, 30 May 2026 03:50:53 +0800
Subject: [PATCH 1/3] vocab : add jina-embeddings-v2-base-zh (whitespace
 tokenizer)

---
 conversion/base.py          | 10 +++++++
 conversion/bert.py          | 11 +++++++-
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/vocab.py       | 27 +++++++++++++++++++
 src/llama-arch.cpp          |  1 +
 src/llama-arch.h            |  1 +
 src/llama-vocab.cpp         | 53 ++++++++++++++++++++++++++++++++++---
 src/llama-vocab.h           |  2 ++
 9 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/conversion/base.py b/conversion/base.py
index 44b2c964f4b..5ccf8d49263 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1689,6 +1689,16 @@ def _set_vocab_gpt2(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_whitespace(self) -> None:
+        # write the base vocab with both the tokenizer model and the pre-tokenizer pinned to "whitespace"
+        vocab_tokens, vocab_toktypes, _ = self.get_vocab_base()
+        writer = self.gguf_writer
+        writer.add_tokenizer_model("whitespace")
+        writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
+        writer.add_token_list(vocab_tokens)
+        writer.add_token_types(vocab_toktypes)
+        gguf.SpecialVocab(self.dir_model, load_merges=True).add_to_gguf(writer)  # merges are loaded as well
+
     def _set_vocab_hybriddna(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
diff --git a/conversion/bert.py b/conversion/bert.py
index 8af6c534d07..9eb320e58aa 100644
--- a/conversion/bert.py
+++ b/conversion/bert.py
@@ -571,7 +571,16 @@ def set_vocab(self):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    pre_tokenizer_type = (json.load(f).get("pre_tokenizer") or {}).get("type")  # "pre_tokenizer" may be null
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 5a567e2d159..b4dfd58382d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -268,6 +268,8 @@ class Tokenizer:
         CHAT_TEMPLATE        = "tokenizer.chat_template"
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a101382719d..e94b47badb4 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
 
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
+    def add_normalizer_lowercase(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)  # KV key: "tokenizer.ggml.normalizer.lowercase"
+
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 09a9b7d1835..fcadd1491ec 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ def __init__(
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = False
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase:
+            if not quiet:
+                logger.info('Setting normalizer_lowercase to True')
+            gw.add_normalizer_lowercase(True)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # Sets self.normalizer_lowercase from a tokenizer.json "normalizer" object; other types are ignored.
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        # Lowercase normalization is detected in three possible formats:
+        # 1. Standalone: {"type": "Lowercase"}
+        # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...} (absent => true)
+        # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]} (each entry is parsed recursively)
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase', True):
+                self.normalizer_lowercase = True
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                         ]
                     else:
                         raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration (e.g. Lowercase) into metadata
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b485ac02e75..be8f73cc1ed 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
     { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"                 },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase"     },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"         },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"         },
     { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"         },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index b59043e408f..2c71bbe8156 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -308,6 +308,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 473becade82..a01f2ace372 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_WHITESPACE:
+                // whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh)
+                regex_exprs = {
+                    "\\S+",
+                };
+                byte_encode = false;
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1671,6 +1678,35 @@ struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
     const llama_vocab & vocab;
 };
 
+struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
+    llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
+
+    // splits text on Unicode whitespace and BPE-tokenizes each piece, lowercasing first when the vocab requests it
+    void tokenize(const std::string & text, std::vector<llama_token> & output) override {
+        const bool to_lower = vocab.get_normalizer_lowercase();
+        const auto cpts     = unicode_cpts_from_utf8(text);
+
+        std::string piece;
+        for (size_t i = 0; i <= cpts.size(); ++i) {
+            // whitespace and end-of-input both terminate the current piece; the whitespace itself is dropped
+            const bool at_boundary = i == cpts.size() || unicode_cpt_flags_from_cpt(cpts[i]).is_whitespace;
+            if (!at_boundary) {
+                piece += unicode_cpt_to_utf8(to_lower ? unicode_tolower(cpts[i]) : cpts[i]);
+                continue;
+            }
+            if (piece.empty()) {
+                continue;
+            }
+            // hand the finished piece to the regular BPE path
+            llm_tokenizer_bpe_session::tokenize(piece, output);
+            piece.clear();
+        }
+    }
+
+private:
+    const llama_vocab & vocab;
+};
+
 //
 // impl
 //
@@ -1751,6 +1787,7 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces   = false;
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
+    bool normalizer_lowercase       = false; // Lowercase normalizer (tokenizer.json)
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_mask_id = 103;
 
             add_sep = true;
-        } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
+        } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
             // read bpe merges and populate bpe ranks
@@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "roberta-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
                 add_sep = true;
+            } else if (
+                    tokenizer_pre == "whitespace") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
             } else if (
                     tokenizer_pre == "refact") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
 
-        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
-        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,           add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase,     false);
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -3264,6 +3305,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                 std::unique_ptr<llm_tokenizer_bpe_session> session;
                 if (vocab.get_tokenizer_model() == "hybriddna") {
                     session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
+                } else if (vocab.get_tokenizer_model() == "whitespace") {
+                    session = std::make_unique<llm_tokenizer_whitespace_session>(vocab, *tok_bpe);
                 } else {
                     session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
                 }
@@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }
 
+bool llama_vocab::get_normalizer_lowercase() const {
+    return pimpl->normalizer_lowercase; // tokenizer.ggml.normalizer.lowercase, or the built-in default when the key is absent
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 8ab77594284..093e5d02cda 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -61,6 +61,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
     LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE      = 51,
     LLAMA_VOCAB_PRE_TYPE_MINICPM5        = 52,
+    LLAMA_VOCAB_PRE_TYPE_WHITESPACE      = 53,
 };
 
 struct LLM_KV;
@@ -138,6 +139,7 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces  () const;
     bool get_escape_whitespaces        () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_normalizer_lowercase      () const;
 
     int max_token_len() const;
 

From f7a7610b8b1aee7bceb4ae560080dbe30b3601c4 Mon Sep 17 00:00:00 2001
From: o7si <o7si@kanda-mashiro.cc>
Date: Sat, 30 May 2026 15:34:10 +0800
Subject: [PATCH 2/3] vocab : add normalizer.lowercase support to WPM

---
 gguf-py/gguf/vocab.py | 13 ++++++-------
 src/llama-vocab.cpp   | 17 +++++++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index fcadd1491ec..2847de5e002 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -52,7 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
-    normalizer_lowercase: bool
+    normalizer_lowercase: bool | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -65,7 +65,7 @@ def __init__(
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
-        self.normalizer_lowercase = False
+        self.normalizer_lowercase = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -104,10 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
-        if self.normalizer_lowercase:
+        if self.normalizer_lowercase is not None:
             if not quiet:
-                logger.info('Setting normalizer_lowercase to True')
-            gw.add_normalizer_lowercase(True)
+                logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
+            gw.add_normalizer_lowercase(self.normalizer_lowercase)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -164,8 +164,7 @@ def _parse_normalizer(self, normalizer: dict) -> None:
         if normalizer_type == 'Lowercase':
             self.normalizer_lowercase = True
         elif normalizer_type == 'BertNormalizer':
-            if normalizer.get('lowercase', True):
-                self.normalizer_lowercase = True
+            self.normalizer_lowercase = normalizer.get('lowercase', True)
         elif normalizer_type == 'Sequence':
             for norm in normalizer.get('normalizers', []):
                 self._parse_normalizer(norm)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a01f2ace372..dc3b3e99440 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -754,7 +754,7 @@ struct llm_tokenizer_wpm_session {
 
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         // normalize and split by whitespace
-        std::vector<std::string> words = preprocess(text);
+        std::vector<std::string> words = preprocess(text, vocab.get_normalizer_lowercase());
         // bos token prepended already
 
         // find the longest tokens that form the words
@@ -799,7 +799,7 @@ struct llm_tokenizer_wpm_session {
     }
 
     // TODO: reduce string copies by using cpts_offs array
-    static std::vector<std::string> preprocess(const std::string & text)  {
+    static std::vector<std::string> preprocess(const std::string & text, bool lowercase)  {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
@@ -818,7 +818,7 @@ struct llm_tokenizer_wpm_session {
                 continue;
             }
 
-            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+            const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
             if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
                 if (words.back().size()) {  // finish previous word if any
                     words.emplace_back();
@@ -1937,6 +1937,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_mask_id = 103;
 
             add_sep = true;
+
+            // BERT lowercases by default (used when the metadata flag is absent, e.g. legacy GGUFs)
+            normalizer_lowercase = true;
         } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -2339,9 +2342,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
 
-        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,           add_space_prefix,         false);
-        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      remove_extra_whitespaces, false);
-        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase,     false);
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -2511,6 +2513,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
+        // Lowercase normalizer flag (consulted by WPM / whitespace BPE)
+        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
+
         // auto-detect special tokens by text
         // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
         //       for now, we apply this workaround to find the tokens based on their text

From 3f550f62d3b43027c429e1375a2409c7e93b37fa Mon Sep 17 00:00:00 2001
From: o7si <o7si@kanda-mashiro.cc>
Date: Mon, 1 Jun 2026 10:26:51 +0800
Subject: [PATCH 3/3] vocab : default normalizer.lowercase to false for
 whitespace pre-tokenizer

---
 src/llama-vocab.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 76c738880c4..04183efc4d0 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2159,6 +2159,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                     tokenizer_pre == "whitespace") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
+                normalizer_lowercase = false;
             } else if (
                     tokenizer_pre == "refact") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;