Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,16 @@ def _set_vocab_gpt2(self) -> None:
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)

def _set_vocab_whitespace(self) -> None:
    """Write the vocabulary metadata for a whitespace pre-tokenized model.

    Takes the tokens and token types from ``get_vocab_base()``, records
    "whitespace" as both the tokenizer model and the pre-tokenizer, and then
    adds the special vocabulary (merges included) read from ``self.dir_model``.
    """
    vocab_tokens, vocab_toktypes, _ = self.get_vocab_base()

    writer = self.gguf_writer
    writer.add_tokenizer_model("whitespace")
    # pinned, not hash-detected: chktxt hash collides with jina-v1-en
    writer.add_tokenizer_pre("whitespace")
    writer.add_token_list(vocab_tokens)
    writer.add_token_types(vocab_toktypes)

    gguf.SpecialVocab(self.dir_model, load_merges=True).add_to_gguf(writer)

def _set_vocab_hybriddna(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
Expand Down
11 changes: 10 additions & 1 deletion conversion/bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,16 @@ def set_vocab(self):
if tokenizer_class == 'BertTokenizer':
super().set_vocab()
elif tokenizer_class == 'RobertaTokenizer':
self._set_vocab_gpt2()
pre_tokenizer_type = None
tokenizer_json_path = self.dir_model / "tokenizer.json"
if tokenizer_json_path.is_file():
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")

if pre_tokenizer_type == "Whitespace":
self._set_vocab_whitespace()
else:
self._set_vocab_gpt2()
self.gguf_writer.add_token_type_count(2)
else:
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
Expand Down
2 changes: 2 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ class Tokenizer:
CHAT_TEMPLATE = "tokenizer.chat_template"
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates"
# Normalizer constants
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
# FIM/Infill special tokens constants
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:

self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

def add_normalizer_lowercase(self, value: bool) -> None:
    """Store the lowercase-normalizer flag as a boolean key.

    Args:
        value: flag written under ``Keys.Tokenizer.NORMALIZER_LOWERCASE``.
    """
    key = Keys.Tokenizer.NORMALIZER_LOWERCASE
    self.add_bool(key, value)

def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id)

Expand Down
27 changes: 27 additions & 0 deletions gguf-py/gguf/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class SpecialVocab:
add_special_token: dict[str, bool]
special_token_ids: dict[str, int]
chat_template: str | Sequence[Mapping[str, str]] | None
normalizer_lowercase: bool | None

def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
Expand All @@ -64,6 +65,7 @@ def __init__(
self.load_merges = load_merges
self.merges = []
self.chat_template = None
self.normalizer_lowercase = None
if special_token_types is not None:
self.special_token_types = special_token_types
else:
Expand Down Expand Up @@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if not quiet:
logger.info(f'Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template)
if self.normalizer_lowercase is not None:
if not quiet:
logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
gw.add_normalizer_lowercase(self.normalizer_lowercase)

def _load(self, path: Path) -> None:
self._try_load_from_tokenizer_json(path)
Expand Down Expand Up @@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
return
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

def _parse_normalizer(self, normalizer: dict) -> None:
    """Record whether the tokenizer's normalizer lowercases its input.

    ref: https://huggingface.co/docs/tokenizers/api/normalizers

    Lowercase normalization is detected in three possible formats:
      1. Standalone: {"type": "Lowercase"}
      2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
      3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}

    ``self.normalizer_lowercase`` is set to True as soon as any visited
    normalizer lowercases, to False when the only evidence is a
    BertNormalizer with ``"lowercase": false``, and is left untouched when
    nothing relevant is found.

    Args:
        normalizer: the ``normalizer`` object taken from tokenizer.json.
    """
    # NOTE(review): raised in PR discussion - Lowercase could be configured
    # together with a tokenizer other than Whitespace, or left out with
    # Whitespace; that combination is deliberately not handled here.

    # Explicit stack instead of recursion: the result does not depend on visit
    # order because a detected lowercase is never downgraded (see below).
    pending = [normalizer]
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            # Malformed or unexpected entry - nothing to inspect.
            continue

        node_type = node.get('type')
        if node_type == 'Lowercase':
            self.normalizer_lowercase = True
        elif node_type == 'BertNormalizer':
            flag = node.get('lowercase')
            if flag is None:
                continue
            if flag:
                self.normalizer_lowercase = True
            elif self.normalizer_lowercase is None:
                # Only report False when no other normalizer lowercases: in a
                # Sequence every normalizer is applied, so one Lowercase step
                # is enough for the text to end up lowercased.
                self.normalizer_lowercase = False
        elif node_type == 'Sequence':
            # "normalizers" may be missing or null; treat both as empty.
            pending.extend(node.get('normalizers') or [])

def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer = None
tokenizer_file = path / 'tokenizer.json'
Expand Down Expand Up @@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
]
else:
raise ValueError("Unknown tokenizer merges format")
# Parse normalizer configuration (e.g. Lowercase) into metadata
if normalizer := tokenizer.get('normalizer'):
self._parse_normalizer(normalizer)
added_tokens = tokenizer.get('added_tokens', {})
else:
added_tokens = {}
Expand Down
1 change: 1 addition & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
Expand Down
1 change: 1 addition & 0 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
Expand Down
53 changes: 50 additions & 3 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_WHITESPACE:
// whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh)
regex_exprs = {
"\\S+",
};
byte_encode = false;
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
Expand Down Expand Up @@ -1671,6 +1678,35 @@ struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
const llama_vocab & vocab;
};

// BPE session used for the "whitespace" tokenizer model: the input is cut at
// whitespace code points, each piece is optionally lowercased, and every piece
// is then handed to the regular BPE session.
struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
    llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer)
        : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}

    // Appends the tokens of `text` to `output`; whitespace itself produces no tokens.
    void tokenize(const std::string & text, std::vector<llama_token> & output) override {
        // NOTE(review): raised in PR discussion - the WPM tokenizer should
        // probably honor this flag as well (follow-up: #23899).
        const bool to_lower = vocab.get_normalizer_lowercase();

        std::string word;

        // Tokenize the piece collected so far (if any) and start a new one.
        const auto emit_word = [&]() {
            if (word.empty()) {
                return;
            }
            llm_tokenizer_bpe_session::tokenize(word, output);
            word.clear();
        };

        const auto cpts = unicode_cpts_from_utf8(text);
        for (const uint32_t cpt : cpts) {
            if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) {
                // drop whitespace: it only terminates the current piece
                emit_word();
                continue;
            }
            const auto mapped = to_lower ? unicode_tolower(cpt) : cpt;
            word += unicode_cpt_to_utf8(mapped);
        }

        // Trailing piece not followed by whitespace.
        emit_word();
    }

private:
    const llama_vocab & vocab;
};

//
// impl
//
Expand Down Expand Up @@ -1751,6 +1787,7 @@ struct llama_vocab::impl {
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json)

std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
Expand Down Expand Up @@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_mask_id = 103;

add_sep = true;
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") {
type = LLAMA_VOCAB_TYPE_BPE;

// read bpe merges and populate bpe ranks
Expand Down Expand Up @@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "roberta-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
add_sep = true;
} else if (
tokenizer_pre == "whitespace") {
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
Expand Down Expand Up @@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
}

const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
Expand Down Expand Up @@ -3264,6 +3305,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
std::unique_ptr<llm_tokenizer_bpe_session> session;
if (vocab.get_tokenizer_model() == "hybriddna") {
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
} else if (vocab.get_tokenizer_model() == "whitespace") {
session = std::make_unique<llm_tokenizer_whitespace_session>(vocab, *tok_bpe);
} else {
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
}
Expand Down Expand Up @@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
return pimpl->treat_whitespace_as_suffix;
}

// Returns the lowercase-normalizer flag held by the vocab implementation (pimpl->normalizer_lowercase).
bool llama_vocab::get_normalizer_lowercase() const {
return pimpl->normalizer_lowercase;
}

// Returns the max_token_len value held by the vocab implementation (pimpl->max_token_len).
int llama_vocab::max_token_len() const {
return pimpl->max_token_len;
}
Expand Down
2 changes: 2 additions & 0 deletions src/llama-vocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
};

struct LLM_KV;
Expand Down Expand Up @@ -138,6 +139,7 @@ struct llama_vocab {
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
bool get_normalizer_lowercase () const;

int max_token_len() const;

Expand Down
Loading