From 0c1c9d33a805aad92c28cc15213a0dc5867f8818 Mon Sep 17 00:00:00 2001 From: o7si Date: Sat, 30 May 2026 03:50:53 +0800 Subject: [PATCH 1/3] vocab : add jina-embeddings-v2-base-zh (whitespace tokenizer) --- conversion/base.py | 10 +++++++ conversion/bert.py | 11 +++++++- gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/gguf_writer.py | 3 +++ gguf-py/gguf/vocab.py | 27 +++++++++++++++++++ src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-vocab.cpp | 53 ++++++++++++++++++++++++++++++++++--- src/llama-vocab.h | 2 ++ 9 files changed, 106 insertions(+), 4 deletions(-) diff --git a/conversion/base.py b/conversion/base.py index 44b2c964f4b..5ccf8d49263 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1689,6 +1689,16 @@ def _set_vocab_gpt2(self) -> None: special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_whitespace(self) -> None: + tokens, toktypes, _ = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("whitespace") + self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_hybriddna(self): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) diff --git a/conversion/bert.py b/conversion/bert.py index 8af6c534d07..9eb320e58aa 100644 --- a/conversion/bert.py +++ b/conversion/bert.py @@ -571,7 +571,16 @@ def set_vocab(self): if tokenizer_class == 'BertTokenizer': super().set_vocab() elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() + pre_tokenizer_type = None + tokenizer_json_path = self.dir_model / "tokenizer.json" + if tokenizer_json_path.is_file(): + with open(tokenizer_json_path, "r", encoding="utf-8") as f: 
+ pre_tokenizer_type = (json.load(f).get("pre_tokenizer") or {}).get("type") + + if pre_tokenizer_type == "Whitespace": + self._set_vocab_whitespace() + else: + self._set_vocab_gpt2() self.gguf_writer.add_token_type_count(2) else: raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5a567e2d159..b4dfd58382d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -268,6 +268,8 @@ class Tokenizer: CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" + # Normalizer constants + NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase" # FIM/Infill special tokens constants FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a101382719d..e94b47badb4 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def add_normalizer_lowercase(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value) + def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 09a9b7d1835..fcadd1491ec 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -52,6 +52,7 @@ class SpecialVocab: add_special_token: dict[str, bool] special_token_ids: dict[str, int] chat_template: str | Sequence[Mapping[str, str]] | None + normalizer_lowercase: bool def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -64,6 +65,7 @@ def __init__( self.load_merges = load_merges self.merges = [] self.chat_template = None + self.normalizer_lowercase = False if 
special_token_types is not None: self.special_token_types = special_token_types else: @@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) + if self.normalizer_lowercase: + if not quiet: + logger.info('Setting normalizer_lowercase to True') + gw.add_normalizer_lowercase(True) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None: return logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') + def _parse_normalizer(self, normalizer: dict) -> None: + # ref: https://huggingface.co/docs/tokenizers/api/normalizers + # + # Detects lowercase normalization in three possible formats: + # 1. Standalone: {"type": "Lowercase"} + # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...} + # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]} + + normalizer_type = normalizer.get('type') + if normalizer_type == 'Lowercase': + self.normalizer_lowercase = True + elif normalizer_type == 'BertNormalizer': + if normalizer.get('lowercase', True): + self.normalizer_lowercase = True + elif normalizer_type == 'Sequence': + for norm in normalizer.get('normalizers', []): + self._parse_normalizer(norm) + def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer = None tokenizer_file = path / 'tokenizer.json' @@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: ] else: raise ValueError("Unknown tokenizer merges format") + # Parse normalizer configuration (e.g. 
Lowercase) into metadata + if normalizer := tokenizer.get('normalizer'): + self._parse_normalizer(normalizer) added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b485ac02e75..be8f73cc1ed 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -319,6 +319,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, + { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index b59043e408f..2c71bbe8156 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -308,6 +308,7 @@ enum llm_kv { LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_CHAT_TEMPLATE, + LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, LLM_KV_TOKENIZER_FIM_PRE_ID, LLM_KV_TOKENIZER_FIM_SUF_ID, LLM_KV_TOKENIZER_FIM_MID_ID, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 473becade82..a01f2ace372 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_WHITESPACE: + // whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh) + regex_exprs = { + "\\S+", + }; + byte_encode = false; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1671,6 +1678,35 @@ struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { const llama_vocab & vocab; }; +struct 
llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session { + llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} + + void tokenize(const std::string & text, std::vector & output) override { + const bool lowercase = vocab.get_normalizer_lowercase(); + + std::string segment; + auto flush = [&]() { + if (!segment.empty()) { + llm_tokenizer_bpe_session::tokenize(segment, output); + segment.clear(); + } + }; + + for (uint32_t cpt : unicode_cpts_from_utf8(text)) { + // drop whitespace + if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) { + flush(); + } else { + segment += unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); + } + } + flush(); + } + +private: + const llama_vocab & vocab; +}; + // // impl // @@ -1751,6 +1787,7 @@ struct llama_vocab::impl { bool remove_extra_whitespaces = false; bool escape_whitespaces = true; bool treat_whitespace_as_suffix = false; + bool normalizer_lowercase = false; // Lowercase normalizer (tokenizer.json) std::unordered_map token_to_id; std::vector id_to_token; @@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_mask_id = 103; add_sep = true; - } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") { + } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") { type = LLAMA_VOCAB_TYPE_BPE; // read bpe merges and populate bpe ranks @@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "roberta-bpe") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; add_sep = true; + } else if ( + tokenizer_pre == "whitespace") { + pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; } else if ( tokenizer_pre == "refact") { pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; @@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = 
LLAMA_VOCAB_PRE_TYPE_DEFAULT; } - ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); - ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); + ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); + ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); } const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); @@ -3264,6 +3305,8 @@ std::vector llama_vocab::impl::tokenize( std::unique_ptr session; if (vocab.get_tokenizer_model() == "hybriddna") { session = std::make_unique(vocab, *tok_bpe); + } else if (vocab.get_tokenizer_model() == "whitespace") { + session = std::make_unique(vocab, *tok_bpe); } else { session = std::make_unique(vocab, *tok_bpe); } @@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const { return pimpl->treat_whitespace_as_suffix; } +bool llama_vocab::get_normalizer_lowercase() const { + return pimpl->normalizer_lowercase; +} + int llama_vocab::max_token_len() const { return pimpl->max_token_len; } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 8ab77594284..093e5d02cda 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -61,6 +61,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, }; struct LLM_KV; @@ -138,6 +139,7 @@ struct llama_vocab { bool get_remove_extra_whitespaces () const; bool get_escape_whitespaces () const; bool get_treat_whitespace_as_suffix() const; + bool get_normalizer_lowercase () const; int max_token_len() const; From f7a7610b8b1aee7bceb4ae560080dbe30b3601c4 Mon Sep 17 00:00:00 2001 From: o7si Date: Sat, 30 May 2026 15:34:10 +0800 Subject: [PATCH 2/3] vocab : add normalizer.lowercase support to WPM --- gguf-py/gguf/vocab.py | 13 ++++++------- src/llama-vocab.cpp | 17 +++++++++++------ 2 
files changed, 17 insertions(+), 13 deletions(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index fcadd1491ec..2847de5e002 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -52,7 +52,7 @@ class SpecialVocab: add_special_token: dict[str, bool] special_token_ids: dict[str, int] chat_template: str | Sequence[Mapping[str, str]] | None - normalizer_lowercase: bool + normalizer_lowercase: bool | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -65,7 +65,7 @@ def __init__( self.load_merges = load_merges self.merges = [] self.chat_template = None - self.normalizer_lowercase = False + self.normalizer_lowercase = None if special_token_types is not None: self.special_token_types = special_token_types else: @@ -104,10 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) - if self.normalizer_lowercase: + if self.normalizer_lowercase is not None: if not quiet: - logger.info('Setting normalizer_lowercase to True') - gw.add_normalizer_lowercase(True) + logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}') + gw.add_normalizer_lowercase(self.normalizer_lowercase) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -164,8 +164,7 @@ def _parse_normalizer(self, normalizer: dict) -> None: if normalizer_type == 'Lowercase': self.normalizer_lowercase = True elif normalizer_type == 'BertNormalizer': - if normalizer.get('lowercase', True): - self.normalizer_lowercase = True + self.normalizer_lowercase = normalizer.get('lowercase', True) elif normalizer_type == 'Sequence': for norm in normalizer.get('normalizers', []): self._parse_normalizer(norm) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a01f2ace372..dc3b3e99440 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -754,7 +754,7 @@ struct 
llm_tokenizer_wpm_session { void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace - std::vector words = preprocess(text); + std::vector words = preprocess(text, vocab.get_normalizer_lowercase()); // bos token prepended already // find the longest tokens that form the words @@ -799,7 +799,7 @@ struct llm_tokenizer_wpm_session { } // TODO: reduce string copies by using cpts_offs array - static std::vector preprocess(const std::string & text) { + static std::vector preprocess(const std::string & text, bool lowercase) { const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); std::vector words(1, ""); @@ -818,7 +818,7 @@ struct llm_tokenizer_wpm_session { continue; } - const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); + const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); @@ -1937,6 +1937,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_mask_id = 103; add_sep = true; + + // BERT lowercases by default (used when the metadata flag is absent, e.g. 
legacy GGUFs) + normalizer_lowercase = true; } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") { type = LLAMA_VOCAB_TYPE_BPE; @@ -2339,9 +2342,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } - ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); - ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); - ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false); + ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false); } const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); @@ -2511,6 +2513,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } } + // Lowercase normalizer flag (consulted by WPM / whitespace BPE) + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + // auto-detect special tokens by text // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... // for now, we apply this workaround to find the tokens based on their text From 3f550f62d3b43027c429e1375a2409c7e93b37fa Mon Sep 17 00:00:00 2001 From: o7si Date: Mon, 1 Jun 2026 10:26:51 +0800 Subject: [PATCH 3/3] vocab : default normalizer.lowercase to false for whitespace pre-tokenizer --- src/llama-vocab.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 76c738880c4..04183efc4d0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2159,6 +2159,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "whitespace") { pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; + normalizer_lowercase = false; } else if ( tokenizer_pre == "refact") { pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;