From 283c76566da92e81cb24b8979f1d1401ef71444f Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 5 Mar 2026 17:01:26 -0800 Subject: [PATCH 01/35] Add backwards-compatible support for multiple EOS tokens Models like Qwen 3/3.5 define multiple EOS token IDs in their GenerationConfig (e.g. [151645, 151643]), but llguidance only supported a single EOS token. This caused models to enter infinite loops when they tried to emit an EOS token that was masked out. Changes: - Add eos_tokens: Vec to TokTrie with accessors and with_eos_tokens() builder - Update TokenParser to check full EOS set for mask computation, token consumption, rollback, and stop detection - Add tok_eos_extra/tok_eos_extra_count to C API LlgTokenizerInit - Python eos_token parameter now accepts int | list[int] - Add eos_tokens getter property to Python LLTokenizer - Update type stubs and all Python helper modules (hf, tiktoken, llamacpp) All existing APIs remain unchanged; single EOS usage is unaffected. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 4 ++ parser/llguidance.h | 10 ++++ parser/src/ffi.rs | 18 ++++++- parser/src/tokenparser.rs | 23 ++++++--- python/llguidance/_lib.pyi | 4 +- python/llguidance/hf.py | 6 +-- python/llguidance/llamacpp.py | 5 +- python/llguidance/tiktoken.py | 6 +-- python_ext/src/llamatokenizer.rs | 10 ++-- python_ext/src/py.rs | 47 ++++++++++++++--- toktrie/src/toktree.rs | 87 ++++++++++++++++++++++++++++++-- toktrie_hf_tokenizers/src/lib.rs | 22 +++++++- toktrie_tiktoken/src/lib.rs | 4 ++ 13 files changed, 211 insertions(+), 35 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 94cacf14..91e23fc6 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -29,6 +29,10 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, LlgTokenizerInit tok_init = {}; tok_init.vocab_size = (uint32_t)tokens.size(); tok_init.tok_eos = tok_eos; + // For models with multiple EOS tokens (e.g., Qwen3), set: 
+ // LlgToken extra_eos[] = {second_eos, third_eos}; + // tok_init.tok_eos_extra = extra_eos; + // tok_init.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); tok_init.token_lens = token_lens; tok_init.token_bytes = token_bytes; tok_init.tokenize_assumes_string = false; diff --git a/parser/llguidance.h b/parser/llguidance.h index 6348b395..db905e21 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -239,6 +239,16 @@ typedef struct LlgTokenizerInit { * Pass NULL to use defaults. Pass empty array to disable. */ const char *const *slices; + /** + * Additional EOS token IDs beyond `tok_eos`. + * Points to an array of `tok_eos_extra_count` elements. + * When NULL (the default for zero-initialized structs), only `tok_eos` is used. + */ + const LlgToken *tok_eos_extra; + /** + * Number of elements in the `tok_eos_extra` array. + */ + uint32_t tok_eos_extra_count; } LlgTokenizerInit; diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 89b8ad4c..945130b2 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -137,7 +137,15 @@ impl LlgTokenizer { token_bytes }; - let trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); + let mut trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); + + if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { + let extra = + unsafe { std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) }; + let mut eos_tokens = vec![init.tok_eos]; + eos_tokens.extend_from_slice(extra); + trie = trie.with_eos_tokens(&eos_tokens); + } let tok_env: TokEnv = Arc::new(CTokenizerInner { trie, @@ -249,6 +257,14 @@ pub struct LlgTokenizerInit { /// This is array of pointers to strings, terminated with NULL (argv style). /// Pass NULL to use defaults. Pass empty array to disable. pub slices: *const *const c_char, + + /// Additional EOS token IDs beyond `tok_eos`. + /// Points to an array of `tok_eos_extra_count` elements. 
+ /// When NULL (the default for zero-initialized structs), only `tok_eos` is used. + pub tok_eos_extra: *const LlgToken, + + /// Number of elements in the `tok_eos_extra` array. + pub tok_eos_extra_count: u32, } #[derive(Clone)] diff --git a/parser/src/tokenparser.rs b/parser/src/tokenparser.rs index d1394c3e..e7be6027 100644 --- a/parser/src/tokenparser.rs +++ b/parser/src/tokenparser.rs @@ -21,7 +21,7 @@ pub struct TokenParser { pub dbg_grammar: String, last_step_stats: ParserStats, max_step_stats: ParserStats, - eos_token: TokenId, + eos_tokens: Vec, had_rollback: bool, had_backtrack: bool, @@ -91,7 +91,7 @@ impl TokenParser { factory.perf_counters(), )?; parser.metrics_mut().rand = factory.next_rng(); - let eos_token = token_env.tok_trie().eos_token(); + let eos_tokens = token_env.tok_trie().eos_tokens().to_vec(); Ok(TokenParser { bias_computer: factory.slicer().clone(), @@ -108,7 +108,7 @@ impl TokenParser { error_message: None, parser, dbg_grammar: String::new(), - eos_token, + eos_tokens, llm_tokens: Vec::new(), llm_bytes: Vec::new(), grm_prefix: Vec::new(), @@ -389,7 +389,7 @@ impl TokenParser { let new_len = self.llm_tokens.len() - n_tokens; let mut bytes_to_drop = 0; for tok in &self.llm_tokens[new_len..] { - if *tok == self.eos_token { + if self.eos_tokens.contains(tok) { // doesn't count; we hope it's last though... 
bytes_to_drop += 0; } else { @@ -492,8 +492,12 @@ impl TokenParser { return Err(self.stop_for_parser_error("", s)); } - if self.eos_token != INVALID_TOKEN && self.is_accepting() { - allowed_tokens.allow_token(self.eos_token); + if self.is_accepting() { + for &eos in &self.eos_tokens { + if eos != INVALID_TOKEN { + allowed_tokens.allow_token(eos); + } + } } self.log_final(&prefix, &allowed_tokens); @@ -797,7 +801,7 @@ impl TokenParser { } self.max_tokens_total -= 1; - if token == self.eos_token { + if self.eos_tokens.contains(&token) { if self.parser.scan_eos() { // it got scanned correctly, so we remove it // this only happens for gen() terminated by EOS @@ -838,7 +842,10 @@ impl TokenParser { /// This generally should be called after consume_token(). pub fn check_stop(&mut self) -> Result { let empty_token_prefix = !self.has_ff_bytes(); - let pending_eos = self.llm_tokens.last() == Some(&self.eos_token); + let pending_eos = self + .llm_tokens + .last() + .is_some_and(|t| self.eos_tokens.contains(t)); let lexer_bytes = self.parser.has_pending_lexeme_bytes(); let is_accepting = self.is_accepting(); let can_advance = self.parser.can_advance(); diff --git a/python/llguidance/_lib.pyi b/python/llguidance/_lib.pyi index 2c5ad662..9f8f6f25 100644 --- a/python/llguidance/_lib.pyi +++ b/python/llguidance/_lib.pyi @@ -6,13 +6,14 @@ from ._tokenizer import TokenizerWrapper class LLTokenizer: vocab_size: int eos_token: TokenId + eos_tokens: List[TokenId] is_canonical: bool def __new__( cls, tokenizer: Union[str, TokenizerWrapper], n_vocab: Optional[int] = None, - eos_token: Optional[TokenId] = None, + eos_token: Optional[Union[TokenId, List[TokenId]]] = None, slices: Optional[List[str]] = None, ) -> "LLTokenizer": """ @@ -23,6 +24,7 @@ class LLTokenizer: Args: tokenizer: str or TokenizerWrapper - if str, it is the name or path to the HF tokenizers tokenizer; otherwise it is a TokenizerWrapper n_vocab: int - override the size of the vocabulary + eos_token: int or list of ints 
- override the EOS token(s) slices: List[str] - configuration for slicer optimization; pass [] to disable, or None to use general_slices() """ diff --git a/python/llguidance/hf.py b/python/llguidance/hf.py index 11da7d45..05c78c03 100644 --- a/python/llguidance/hf.py +++ b/python/llguidance/hf.py @@ -1,5 +1,5 @@ from copy import copy -from typing import List, Optional +from typing import List, Optional, Union import transformers @@ -9,7 +9,7 @@ def from_tokenizer( hf_tokenizer: transformers.PreTrainedTokenizerFast, n_vocab: Optional[int] = None, - eos_token: Optional[int] = None, + eos_token: Optional[Union[int, List[int]]] = None, slices: Optional[List[str]] = None, ) -> LLTokenizer: """ @@ -21,7 +21,7 @@ def from_tokenizer( Args: hf_tokenizer: transformers.PreTrainedTokenizerFast - the tokenizer to wrap n_vocab: int - override the size of the vocabulary - eos_token: int - override the EOS token + eos_token: int or list of ints - override the EOS token(s) slices: List[str] - configuration for slicer optimization; pass [] to disable, or None to use the default configuration """ diff --git a/python/llguidance/llamacpp.py b/python/llguidance/llamacpp.py index 7d495d5d..8adb17c3 100644 --- a/python/llguidance/llamacpp.py +++ b/python/llguidance/llamacpp.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union from ._lib import LLTokenizer @@ -8,7 +8,7 @@ def lltokenizer_from_vocab( vocab: llama_cpp.llama_vocab_p, n_vocab: Optional[int] = None, - eos_token: Optional[int] = None, + eos_token: Optional[Union[int, List[int]]] = None, slices: Optional[List[str]] = None, ) -> LLTokenizer: """ @@ -18,6 +18,7 @@ def lltokenizer_from_vocab( Args: vocab: llama_cpp.llama_vocab_p - the vocab object to use n_vocab: int - override the size of the vocabulary + eos_token: int or list of ints - override the EOS token(s) eos_token: int - override the EOS token slices: List[str] - configuration for slicer optimization; pass [] to disable, or None 
to use the default configuration diff --git a/python/llguidance/tiktoken.py b/python/llguidance/tiktoken.py index 4e4c6668..dced8f97 100644 --- a/python/llguidance/tiktoken.py +++ b/python/llguidance/tiktoken.py @@ -1,4 +1,4 @@ -from typing import List, Optional, TYPE_CHECKING +from typing import List, Optional, Union, TYPE_CHECKING from ._lib import LLTokenizer @@ -10,7 +10,7 @@ def lltokenizer_from_encoding( encoding: 'tiktoken.Encoding', *, n_vocab: Optional[int] = None, - eos_token: Optional[int] = None, + eos_token: Optional[Union[int, List[int]]] = None, slices: Optional[List[str]] = None, ) -> LLTokenizer: """ @@ -20,7 +20,7 @@ def lltokenizer_from_encoding( Args: encoding: tiktoken.Encoding - the encoding object to use n_vocab: int - override the size of the vocabulary - eos_token: int - override the EOS token + eos_token: int or list of ints - override the EOS token(s) slices: List[str] - configuration for slicer optimization; pass [] to disable, or None to use the default configuration """ diff --git a/python_ext/src/llamatokenizer.rs b/python_ext/src/llamatokenizer.rs index 15e92445..c5642ffa 100644 --- a/python_ext/src/llamatokenizer.rs +++ b/python_ext/src/llamatokenizer.rs @@ -119,13 +119,17 @@ pub fn tokenv_from_llamacpp( tokens: Vec>, vocab_ptr: usize, tokenize_fptr: usize, - eos_token: u32, + eos_tokens: &[u32], ) -> Result { + ensure!(!eos_tokens.is_empty(), "eos_tokens must not be empty"); ensure!(vocab_ptr != 0, "vocab_ptr must be non-null"); ensure!(tokenize_fptr != 0, "tokenize_fptr must be non-null"); - let info = TokRxInfo::new(tokens.len() as u32, eos_token); - let trie = TokTrie::from(&info, &tokens); + let info = TokRxInfo::new(tokens.len() as u32, eos_tokens[0]); + let mut trie = TokTrie::from(&info, &tokens); + if eos_tokens.len() > 1 { + trie = trie.with_eos_tokens(eos_tokens); + } let mut llama_tok = LlamaTokenizer { trie, diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index ec595432..615ff2a4 100644 --- 
a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -16,6 +16,23 @@ use toktrie_tiktoken::TikTokenBPE; use crate::llamatokenizer::tokenv_from_llamacpp; +/// Extract eos_token from a Python value that may be int or list[int]. +/// Returns None if the value is None, or Some(vec) if it's an int or list of ints. +fn extract_eos_tokens(obj: &Bound<'_, PyAny>) -> PyResult> { + if let Ok(single) = obj.extract::() { + Ok(vec![single]) + } else if let Ok(list) = obj.extract::>() { + if list.is_empty() { + return Err(PyValueError::new_err("eos_token list must not be empty")); + } + Ok(list) + } else { + Err(PyValueError::new_err( + "eos_token must be an int or a non-empty list of ints", + )) + } +} + struct PyTokenizer { tok_trie: Arc, tokenizer_fun: Py, @@ -36,9 +53,13 @@ impl LLTokenizer { fn py_new( tokenizer: Bound<'_, PyAny>, n_vocab: Option, - eos_token: Option, + eos_token: Option>, slices: Option>, ) -> PyResult { + let eos_tokens = eos_token + .as_ref() + .map(extract_eos_tokens) + .transpose()?; let tok_env: TokEnv = if let Ok(tokenizer_str) = tokenizer.extract::() { if tokenizer_str == "byte" { ApproximateTokEnv::single_byte_env() @@ -48,8 +69,8 @@ impl LLTokenizer { } else { ByteTokenizer::from_file(&tokenizer_str).map_err(val_error)? }; - if let Some(eos_token) = eos_token { - tok.set_eos_token(eos_token); + if let Some(ref eos_tokens) = eos_tokens { + tok.set_eos_tokens(eos_tokens); } tok.into_tok_env(n_vocab).map_err(val_error)? 
} @@ -77,18 +98,22 @@ impl LLTokenizer { encoder: HashMap, u32>, special_tokens: HashMap, pattern: &str, - eos_token: u32, + eos_token: Bound<'_, PyAny>, n_vocab: Option, slices: Option>, ) -> PyResult { - let bpe = TikTokenBPE::new( + let eos_tokens = extract_eos_tokens(&eos_token)?; + let mut bpe = TikTokenBPE::new( encoder.into_iter().collect(), special_tokens.into_iter().collect(), pattern, n_vocab, - eos_token, + eos_tokens[0], ) .map_err(val_error)?; + if eos_tokens.len() > 1 { + bpe.set_eos_tokens(&eos_tokens); + } let tok_env = bpe.to_env(); let factory = ParserFactory::new( @@ -108,11 +133,12 @@ impl LLTokenizer { tokens: Vec>, vocab_ptr: usize, tokenize_fptr: usize, - eos_token: u32, + eos_token: Bound<'_, PyAny>, slices: Option>, ) -> PyResult { + let eos_tokens = extract_eos_tokens(&eos_token)?; let tok_env = - tokenv_from_llamacpp(tokens, vocab_ptr, tokenize_fptr, eos_token).map_err(val_error)?; + tokenv_from_llamacpp(tokens, vocab_ptr, tokenize_fptr, &eos_tokens).map_err(val_error)?; let factory = ParserFactory::new( &tok_env, @@ -244,6 +270,11 @@ impl LLTokenizer { fn eos_token(&self) -> u32 { self.tok_trie().eos_token() } + + #[getter] + fn eos_tokens(&self) -> Vec { + self.tok_trie().eos_tokens().to_vec() + } } impl LLTokenizer { diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 5c542e39..4166f1b9 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -101,6 +101,7 @@ pub struct TokTrie { token_data: Vec, nodes: Vec, max_token_len: usize, + eos_tokens: Vec, } #[derive(Clone, Copy, Zeroable, Pod)] @@ -194,6 +195,7 @@ impl TokTrie { token_data, nodes, max_token_len, + eos_tokens: vec![info.tok_eos], }; r.validate(); r @@ -209,19 +211,27 @@ impl TokTrie { }; words.push(b.to_vec()); } - Self::from(self.info(), &words) + let mut r = Self::from(self.info(), &words); + r.eos_tokens = self.eos_tokens.clone(); + r } pub fn with_eos_token(&self, eos_token: TokenId) -> Self { - self.with_info(TokRxInfo { - tok_eos: eos_token, - 
..self.info - }) + self.with_eos_tokens(&[eos_token]) + } + + pub fn with_eos_tokens(&self, eos_tokens: &[TokenId]) -> Self { + assert!(!eos_tokens.is_empty(), "eos_tokens must not be empty"); + let mut r = self.clone(); + r.info.tok_eos = eos_tokens[0]; + r.eos_tokens = eos_tokens.to_vec(); + r } pub fn with_info(&self, info: TokRxInfo) -> Self { let mut r = self.clone(); r.info = info; + r.eos_tokens = vec![info.tok_eos]; r } @@ -248,6 +258,10 @@ impl TokTrie { self.info.tok_eos } + pub fn eos_tokens(&self) -> &[TokenId] { + &self.eos_tokens + } + pub fn vocab_size(&self) -> usize { self.info.vocab_size as usize } @@ -1189,3 +1203,66 @@ impl Recognizer for AnythingGoes { true } } + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_trie(eos: TokenId) -> TokTrie { + let info = TokRxInfo::new(4, eos); + let words = vec![ + b"a".to_vec(), + b"b".to_vec(), + b"c".to_vec(), + b"d".to_vec(), + ]; + TokTrie::from(&info, &words) + } + + #[test] + fn test_default_single_eos() { + let trie = make_test_trie(2); + assert_eq!(trie.eos_token(), 2); + assert_eq!(trie.eos_tokens(), &[2]); + } + + #[test] + fn test_with_eos_tokens_multiple() { + let trie = make_test_trie(0).with_eos_tokens(&[1, 3]); + assert_eq!(trie.eos_token(), 1); + assert_eq!(trie.eos_tokens(), &[1, 3]); + assert_eq!(trie.info().tok_eos, 1); + } + + #[test] + fn test_with_eos_token_backwards_compat() { + let trie = make_test_trie(0).with_eos_token(2); + assert_eq!(trie.eos_token(), 2); + assert_eq!(trie.eos_tokens(), &[2]); + } + + #[test] + fn test_with_info_resets_eos_tokens() { + let trie = make_test_trie(0).with_eos_tokens(&[1, 2]); + let trie2 = trie.with_info(TokRxInfo::new(4, 3)); + assert_eq!(trie2.eos_token(), 3); + assert_eq!(trie2.eos_tokens(), &[3]); + } + + #[test] + fn test_filter_preserves_eos_tokens() { + let trie = make_test_trie(0).with_eos_tokens(&[1, 2]); + let mut filter = trie.alloc_token_set(); + for i in 0..4 { + filter.allow_token(i); + } + let filtered = 
trie.filter(&filter); + assert_eq!(filtered.eos_tokens(), &[1, 2]); + } + + #[test] + #[should_panic(expected = "eos_tokens must not be empty")] + fn test_with_eos_tokens_empty_panics() { + make_test_trie(0).with_eos_tokens(&[]); + } +} diff --git a/toktrie_hf_tokenizers/src/lib.rs b/toktrie_hf_tokenizers/src/lib.rs index f77f19cf..6f54beab 100644 --- a/toktrie_hf_tokenizers/src/lib.rs +++ b/toktrie_hf_tokenizers/src/lib.rs @@ -12,6 +12,7 @@ pub struct ByteTokenizer { pub hf_tokenizer: Tokenizer, info: TokRxInfo, token_bytes: Vec>, + eos_tokens_extra: Vec, } // useful when debugging this: https://www.cogsci.ed.ac.uk/~richard/utf-8.cgi @@ -148,6 +149,7 @@ impl ByteTokenizer { info: TokRxInfo::new(vocab_size, 0), token_bytes: (0..vocab_size).map(|_| Vec::new()).collect(), hf_tokenizer: hft, + eos_tokens_extra: Vec::new(), }; let mut specials = HashSet::new(); @@ -231,6 +233,19 @@ impl ByteTokenizer { pub fn set_eos_token(&mut self, tok_id: u32) { self.info.tok_eos = tok_id; + self.eos_tokens_extra.clear(); + } + + pub fn set_eos_tokens(&mut self, tokens: &[TokenId]) { + assert!(!tokens.is_empty(), "eos_tokens must not be empty"); + self.info.tok_eos = tokens[0]; + self.eos_tokens_extra = tokens[1..].to_vec(); + } + + pub fn eos_tokens(&self) -> Vec { + let mut r = vec![self.info.tok_eos]; + r.extend_from_slice(&self.eos_tokens_extra); + r } pub fn into_tok_env(self, n_vocab: Option) -> Result { @@ -259,7 +274,11 @@ impl ByteTokenizerEnv { } info.vocab_size = n_vocab as u32; } - let tok_trie = TokTrie::from(&info, &token_bytes); + let eos_tokens = tokenizer.eos_tokens(); + let mut tok_trie = TokTrie::from(&info, &token_bytes); + if eos_tokens.len() > 1 { + tok_trie = tok_trie.with_eos_tokens(&eos_tokens); + } Ok(ByteTokenizerEnv { tokenizer, tok_trie, @@ -352,6 +371,7 @@ mod tests { hf_tokenizer, info, token_bytes, + eos_tokens_extra: Vec::new(), }; let env = ByteTokenizerEnv::new(tokenizer, None).unwrap(); let special_id = 
env.tok_trie().get_special_token("<|end|>").unwrap(); diff --git a/toktrie_tiktoken/src/lib.rs b/toktrie_tiktoken/src/lib.rs index 27d826e9..e76d57cc 100644 --- a/toktrie_tiktoken/src/lib.rs +++ b/toktrie_tiktoken/src/lib.rs @@ -79,6 +79,10 @@ impl TikTokenBPE { *self.tok_trie.info() } + pub fn set_eos_tokens(&mut self, tokens: &[TokenId]) { + self.tok_trie = self.tok_trie.with_eos_tokens(tokens); + } + pub fn to_env(self) -> TokEnv { Arc::new(self) } From b261ae9ba18eb2ecb63dfec38f6ecb1e04e9d3bd Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 5 Mar 2026 17:32:44 -0800 Subject: [PATCH 02/35] cargo fmt --- parser/src/ffi.rs | 5 +++-- python_ext/src/py.rs | 9 +++------ toktrie/src/toktree.rs | 7 +------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 945130b2..125ebb1b 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -140,8 +140,9 @@ impl LlgTokenizer { let mut trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { - let extra = - unsafe { std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) }; + let extra = unsafe { + std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) + }; let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); trie = trie.with_eos_tokens(&eos_tokens); diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index 615ff2a4..48b81c41 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -56,10 +56,7 @@ impl LLTokenizer { eos_token: Option>, slices: Option>, ) -> PyResult { - let eos_tokens = eos_token - .as_ref() - .map(extract_eos_tokens) - .transpose()?; + let eos_tokens = eos_token.as_ref().map(extract_eos_tokens).transpose()?; let tok_env: TokEnv = if let Ok(tokenizer_str) = tokenizer.extract::() { if tokenizer_str == "byte" { ApproximateTokEnv::single_byte_env() @@ -137,8 +134,8 @@ impl 
LLTokenizer { slices: Option>, ) -> PyResult { let eos_tokens = extract_eos_tokens(&eos_token)?; - let tok_env = - tokenv_from_llamacpp(tokens, vocab_ptr, tokenize_fptr, &eos_tokens).map_err(val_error)?; + let tok_env = tokenv_from_llamacpp(tokens, vocab_ptr, tokenize_fptr, &eos_tokens) + .map_err(val_error)?; let factory = ParserFactory::new( &tok_env, diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 4166f1b9..4f3a3d93 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -1210,12 +1210,7 @@ mod tests { fn make_test_trie(eos: TokenId) -> TokTrie { let info = TokRxInfo::new(4, eos); - let words = vec![ - b"a".to_vec(), - b"b".to_vec(), - b"c".to_vec(), - b"d".to_vec(), - ]; + let words = vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec(), b"d".to_vec()]; TokTrie::from(&info, &words) } From 985e34494269fae7d1c4c831f419df972c034830 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 5 Mar 2026 18:14:42 -0800 Subject: [PATCH 03/35] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- python/llguidance/llamacpp.py | 2 +- python_ext/src/py.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/llguidance/llamacpp.py b/python/llguidance/llamacpp.py index 8adb17c3..5e2cedc3 100644 --- a/python/llguidance/llamacpp.py +++ b/python/llguidance/llamacpp.py @@ -19,7 +19,7 @@ def lltokenizer_from_vocab( vocab: llama_cpp.llama_vocab_p - the vocab object to use n_vocab: int - override the size of the vocabulary eos_token: int or list of ints - override the EOS token(s) - eos_token: int - override the EOS token + slices: List[str] - configuration for slicer optimization; pass [] to disable, or None to use the default configuration """ diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index 48b81c41..dec13f6d 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -16,8 +16,8 @@ use toktrie_tiktoken::TikTokenBPE; use 
crate::llamatokenizer::tokenv_from_llamacpp; -/// Extract eos_token from a Python value that may be int or list[int]. -/// Returns None if the value is None, or Some(vec) if it's an int or list of ints. +/// Extract EOS tokens from a Python value that must be an int or a non-empty list[int]. +/// Returns a Vec on success, or raises PyValueError if the value is invalid or the list is empty. fn extract_eos_tokens(obj: &Bound<'_, PyAny>) -> PyResult> { if let Ok(single) = obj.extract::() { Ok(vec![single]) From abed676aa2ca08dd45de317af6f955ec724ee815 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 5 Mar 2026 17:50:56 -0800 Subject: [PATCH 04/35] Add zero-initialization requirement comment to LlgTokenizerInit Document that LlgTokenizerInit must be zero-initialized before setting fields, as new fields may be appended in future versions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/llguidance.h | 5 +++++ parser/src/ffi.rs | 3 +++ 2 files changed, 8 insertions(+) diff --git a/parser/llguidance.h b/parser/llguidance.h index db905e21..e4099da0 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -186,6 +186,11 @@ typedef size_t (*LlgTokenizeFn)(const void *user_data, uint32_t *output_tokens, size_t output_tokens_len); +/** + * This struct must be zero-initialized (e.g., `= {}` in C/C++) before setting fields. + * New fields may be appended in future versions, and zero-initialization ensures + * they receive safe default values. + */ typedef struct LlgTokenizerInit { /** * The number of tokens in the vocabulary diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 125ebb1b..3cb7e7f6 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -215,6 +215,9 @@ pub type LlgTokenizeFn = Option< /// Function which llg calls when an operation is done. pub type LlgCallback = Option; +/// This struct must be zero-initialized (e.g., `= {}` in C/C++) before setting fields. 
+/// New fields may be appended in future versions, and zero-initialization ensures +/// they receive safe default values. #[repr(C)] pub struct LlgTokenizerInit { /// The number of tokens in the vocabulary From 963d764aa872384216c75a815fb89d5e5d659c59 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 10:53:36 -0800 Subject: [PATCH 05/35] Introduce LlgTokenizerInitV2 and llg_new_tokenizer_v2 for ABI stability Move tok_eos_extra/tok_eos_extra_count out of LlgTokenizerInit into a new LlgTokenizerInitV2 struct that embeds the original as its 'base' field. This keeps LlgTokenizerInit identical to its pre-multi-EOS layout, avoiding any ABI break for existing C consumers. Add llg_new_tokenizer_v2() which accepts the v2 struct. The original llg_new_tokenizer() continues to work unchanged with single-EOS. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 12 ++++-- parser/llguidance.h | 27 ++++++++++-- parser/src/ffi.rs | 95 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 91e23fc6..72cdbe81 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -29,16 +29,20 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, LlgTokenizerInit tok_init = {}; tok_init.vocab_size = (uint32_t)tokens.size(); tok_init.tok_eos = tok_eos; - // For models with multiple EOS tokens (e.g., Qwen3), set: - // LlgToken extra_eos[] = {second_eos, third_eos}; - // tok_init.tok_eos_extra = extra_eos; - // tok_init.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); tok_init.token_lens = token_lens; tok_init.token_bytes = token_bytes; tok_init.tokenize_assumes_string = false; tok_init.tokenize_user_data = tokenize_user_data; tok_init.tokenize_fn = tokenize_fn; + // For models with multiple EOS tokens (e.g., Qwen3), use the v2 API: + // LlgTokenizerInitV2 tok_init_v2 = {}; + // tok_init_v2.base = tok_init; // 
copy base fields + // LlgToken extra_eos[] = {second_eos, third_eos}; + // tok_init_v2.tok_eos_extra = extra_eos; + // tok_init_v2.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); + // auto tok = llg_new_tokenizer_v2(&tok_init_v2, error_buf, sizeof(error_buf)); + char error_buf[128]; auto tok = llg_new_tokenizer(&tok_init, error_buf, sizeof(error_buf)); diff --git a/parser/llguidance.h b/parser/llguidance.h index e4099da0..ff3d53f6 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -244,17 +244,30 @@ typedef struct LlgTokenizerInit { * Pass NULL to use defaults. Pass empty array to disable. */ const char *const *slices; +} LlgTokenizerInit; + +/** + * V2 of the tokenizer initialization struct. + * Extends LlgTokenizerInit with support for multiple EOS tokens. + * Use with `llg_new_tokenizer_v2()`. + * This struct must also be zero-initialized before setting fields. + */ +typedef struct LlgTokenizerInitV2 { + /** + * All fields from the original LlgTokenizerInit. + */ + struct LlgTokenizerInit base; /** - * Additional EOS token IDs beyond `tok_eos`. + * Additional EOS token IDs beyond `base.tok_eos`. * Points to an array of `tok_eos_extra_count` elements. - * When NULL (the default for zero-initialized structs), only `tok_eos` is used. + * When NULL (the default for zero-initialized structs), only `base.tok_eos` is used. */ const LlgToken *tok_eos_extra; /** * Number of elements in the `tok_eos_extra` array. */ uint32_t tok_eos_extra_count; -} LlgTokenizerInit; +} LlgTokenizerInitV2; @@ -362,6 +375,14 @@ struct LlgTokenizer *llg_new_tokenizer(const struct LlgTokenizerInit *tok_init, char *error_string, size_t error_string_len); +/** + * Create a new tokenizer from a LlgTokenizerInitV2 struct. + * This is the v2 API that supports multiple EOS tokens. + */ +struct LlgTokenizer *llg_new_tokenizer_v2(const struct LlgTokenizerInitV2 *tok_init, + char *error_string, + size_t error_string_len); + /** * Clone a tokenizer. 
* This increments a reference count and does a small allocation. diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 3cb7e7f6..3adf494c 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -137,26 +137,48 @@ impl LlgTokenizer { token_bytes }; - let mut trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); + let trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); - if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { + Self::finish_init(init, trie) + } + + fn from_init_v2(init_v2: &LlgTokenizerInitV2) -> Result { + let init = &init_v2.base; + // Build the base tokenizer the same way as v1 + let mut tok = Self::from_init(init)?; + + // Apply additional EOS tokens if provided + if !init_v2.tok_eos_extra.is_null() && init_v2.tok_eos_extra_count > 0 { let extra = unsafe { - std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) + std::slice::from_raw_parts( + init_v2.tok_eos_extra, + init_v2.tok_eos_extra_count as usize, + ) }; let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); - trie = trie.with_eos_tokens(&eos_tokens); + + // Rebuild the factory with updated EOS tokens on the trie + let trie = tok.factory.tok_env().tok_trie().clone().with_eos_tokens(&eos_tokens); + let tok_env: TokEnv = Arc::new(CTokenizerInner { + trie, + tokenize_assumes_string: init.tokenize_assumes_string + && init.tokenize_fn.is_some(), + tokenize_fn: init.tokenize_fn, + tokenize_user_data: init.tokenize_user_data, + }); + let slices = Self::read_slices(init)?; + let factory = + ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; + tok.factory = Arc::new(factory); } - let tok_env: TokEnv = Arc::new(CTokenizerInner { - trie, - tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), - tokenize_fn: init.tokenize_fn, - tokenize_user_data: init.tokenize_user_data, - }); + Ok(tok) + } - let slices = if 
init.slices.is_null() { - SlicedBiasComputer::general_slices() + fn read_slices(init: &LlgTokenizerInit) -> Result> { + if init.slices.is_null() { + Ok(SlicedBiasComputer::general_slices()) } else { let mut slices = vec![]; let mut idx = 0; @@ -169,8 +191,19 @@ impl LlgTokenizer { slices.push(s.to_string()); idx += 1; } - slices - }; + Ok(slices) + } + } + + fn finish_init(init: &LlgTokenizerInit, trie: TokTrie) -> Result { + let tok_env: TokEnv = Arc::new(CTokenizerInner { + trie, + tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), + tokenize_fn: init.tokenize_fn, + tokenize_user_data: init.tokenize_user_data, + }); + + let slices = Self::read_slices(init)?; let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; @@ -262,9 +295,20 @@ pub struct LlgTokenizerInit { /// Pass NULL to use defaults. Pass empty array to disable. pub slices: *const *const c_char, - /// Additional EOS token IDs beyond `tok_eos`. +} + +/// V2 of the tokenizer initialization struct. +/// Extends LlgTokenizerInit with support for multiple EOS tokens. +/// Use with `llg_new_tokenizer_v2()`. +/// This struct must also be zero-initialized before setting fields. +#[repr(C)] +pub struct LlgTokenizerInitV2 { + /// All fields from the original LlgTokenizerInit. + pub base: LlgTokenizerInit, + + /// Additional EOS token IDs beyond `base.tok_eos`. /// Points to an array of `tok_eos_extra_count` elements. - /// When NULL (the default for zero-initialized structs), only `tok_eos` is used. + /// When NULL (the default for zero-initialized structs), only `base.tok_eos` is used. pub tok_eos_extra: *const LlgToken, /// Number of elements in the `tok_eos_extra` array. @@ -689,6 +733,25 @@ pub unsafe extern "C" fn llg_new_tokenizer( } } +/// Create a new tokenizer from a LlgTokenizerInitV2 struct. +/// This is the v2 API that supports multiple EOS tokens. +/// # Safety +/// This function should only be called from C code. 
+#[no_mangle] +pub unsafe extern "C" fn llg_new_tokenizer_v2( + tok_init: &LlgTokenizerInitV2, + error_string: *mut c_char, + error_string_len: usize, +) -> *mut LlgTokenizer { + match LlgTokenizer::from_init_v2(tok_init) { + Ok(tok) => Box::into_raw(Box::new(tok)), + Err(e) => { + save_error_string(e, error_string, error_string_len); + std::ptr::null_mut() + } + } +} + /// Clone a tokenizer. /// This increments a reference count and does a small allocation. #[no_mangle] From ca775a510e69de77b5e3837a9ca18ced626784dd Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 10:57:40 -0800 Subject: [PATCH 06/35] Add struct_size field to LlgTokenizerInitV2 for forward compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The leading struct_size field (set to sizeof(LlgTokenizerInitV2) by callers) lets the library detect which fields are present. Future fields can be appended to the struct without a v3 — callers compiled against an older header will simply have a smaller struct_size, and new fields will be treated as zero/default. llg_new_tokenizer_v2() validates struct_size >= the minimum expected size and returns an error if it's unset or too small. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 1 + parser/llguidance.h | 11 ++++++++++- parser/src/ffi.rs | 19 ++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 72cdbe81..512c0843 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -37,6 +37,7 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, // For models with multiple EOS tokens (e.g., Qwen3), use the v2 API: // LlgTokenizerInitV2 tok_init_v2 = {}; + // tok_init_v2.struct_size = sizeof(tok_init_v2); // tok_init_v2.base = tok_init; // copy base fields // LlgToken extra_eos[] = {second_eos, third_eos}; // tok_init_v2.tok_eos_extra = extra_eos; diff --git a/parser/llguidance.h b/parser/llguidance.h index ff3d53f6..079cce1c 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -250,9 +250,18 @@ typedef struct LlgTokenizerInit { * V2 of the tokenizer initialization struct. * Extends LlgTokenizerInit with support for multiple EOS tokens. * Use with `llg_new_tokenizer_v2()`. - * This struct must also be zero-initialized before setting fields. + * + * Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` + * The struct_size field allows future fields to be appended without breaking + * existing callers — new fields will default to zero when struct_size is smaller + * than the library expects. */ typedef struct LlgTokenizerInitV2 { + /** + * Must be set to `sizeof(LlgTokenizerInitV2)`. + * This allows the library to detect which fields are available. + */ + size_t struct_size; /** * All fields from the original LlgTokenizerInit. 
*/ diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 3adf494c..ad21e0b9 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -143,6 +143,15 @@ impl LlgTokenizer { } fn from_init_v2(init_v2: &LlgTokenizerInitV2) -> Result { + let min_size = std::mem::size_of::(); + ensure!( + init_v2.struct_size >= min_size, + "LlgTokenizerInitV2.struct_size is {} but expected at least {}. \ + Set struct_size = sizeof(LlgTokenizerInitV2).", + init_v2.struct_size, + min_size + ); + let init = &init_v2.base; // Build the base tokenizer the same way as v1 let mut tok = Self::from_init(init)?; @@ -300,9 +309,17 @@ pub struct LlgTokenizerInit { /// V2 of the tokenizer initialization struct. /// Extends LlgTokenizerInit with support for multiple EOS tokens. /// Use with `llg_new_tokenizer_v2()`. -/// This struct must also be zero-initialized before setting fields. +/// +/// Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` +/// The struct_size field allows future fields to be appended without breaking +/// existing callers — new fields will default to zero when struct_size is smaller +/// than the library expects. #[repr(C)] pub struct LlgTokenizerInitV2 { + /// Must be set to `sizeof(LlgTokenizerInitV2)`. + /// This allows the library to detect which fields are available. + pub struct_size: usize, + /// All fields from the original LlgTokenizerInit. pub base: LlgTokenizerInit, From db177e9ac19d176c9df7c7a0cc6f509427267f3c Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 11:02:23 -0800 Subject: [PATCH 07/35] Flatten LlgTokenizerInitV2 fields instead of embedding LlgTokenizerInit Replace the nested 'base: LlgTokenizerInit' member with flat copies of all fields so C consumers write init.vocab_size instead of init.base.vocab_size. Since v2 is the recommended struct going forward, this avoids a permanent ergonomic tax. Internally, from_init_v2() builds a temporary LlgTokenizerInit to delegate to from_init(), keeping the code DRY. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 16 +++++--- parser/llguidance.h | 56 ++++++++++++++++++++++++-- parser/src/ffi.rs | 93 +++++++++++++++++++++++++++++++++---------- 3 files changed, 133 insertions(+), 32 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 512c0843..c5539276 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -36,13 +36,17 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, tok_init.tokenize_fn = tokenize_fn; // For models with multiple EOS tokens (e.g., Qwen3), use the v2 API: - // LlgTokenizerInitV2 tok_init_v2 = {}; - // tok_init_v2.struct_size = sizeof(tok_init_v2); - // tok_init_v2.base = tok_init; // copy base fields + // LlgTokenizerInitV2 init_v2 = {}; + // init_v2.struct_size = sizeof(init_v2); + // init_v2.vocab_size = tok_init.vocab_size; + // init_v2.tok_eos = tok_init.tok_eos; + // init_v2.token_lens = tok_init.token_lens; + // init_v2.token_bytes = tok_init.token_bytes; + // init_v2.tokenize_fn = tok_init.tokenize_fn; // LlgToken extra_eos[] = {second_eos, third_eos}; - // tok_init_v2.tok_eos_extra = extra_eos; - // tok_init_v2.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); - // auto tok = llg_new_tokenizer_v2(&tok_init_v2, error_buf, sizeof(error_buf)); + // init_v2.tok_eos_extra = extra_eos; + // init_v2.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); + // auto tok = llg_new_tokenizer_v2(&init_v2, error_buf, sizeof(error_buf)); char error_buf[128]; auto tok = llg_new_tokenizer(&tok_init, error_buf, sizeof(error_buf)); diff --git a/parser/llguidance.h b/parser/llguidance.h index 079cce1c..010d4221 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -263,13 +263,61 @@ typedef struct LlgTokenizerInitV2 { */ size_t struct_size; /** - * All fields from the original LlgTokenizerInit. 
+ * The number of tokens in the vocabulary + */ + uint32_t vocab_size; + /** + * The token ID for the end of sentence token + * For chat mode, set it to end-of-turn token + */ + LlgToken tok_eos; + /** + * An array of the lengths of the token strings (vocab_size elements) + */ + const uint32_t *token_lens; + /** + * A pointer to the token strings + * The length of this the sum of all token_lens + */ + const uint8_t *token_bytes; + /** + * Instead of passing token_lens and token_bytes, this can be set to + * the contents of HF tokenizer.json file. + */ + const char *tokenizer_json; + /** + * Set to true to enable hack that works around the tokenize_fn only + * accepting valid UTF-8 strings and possibly adding etc. + * TODO: the bit not implemented yet */ - struct LlgTokenizerInit base; + bool tokenize_assumes_string; + /** + * Tokenization function, see LlgTokenizeFn docs. + * It should only tokenize the bytes and not add + * any etc. It should also work on any byte sequence, including + * invalid UTF-8. If this is not the case, set tokenize_assumes_string to true. + * Either way, this function has to be thread-safe! + */ + LlgTokenizeFn tokenize_fn; + /** + * Set to true to not use tokenize_fn and instead tokenize greedily, + * which is often incorrect and may reduce accuracy. + */ + bool use_approximate_greedy_tokenize_fn; + /** + * User data to pass to the tokenize_fn + */ + const void *tokenize_user_data; + /** + * Tokenizer partitions for the slicer optimization. + * This is array of pointers to strings, terminated with NULL (argv style). + * Pass NULL to use defaults. Pass empty array to disable. + */ + const char *const *slices; /** - * Additional EOS token IDs beyond `base.tok_eos`. + * Additional EOS token IDs beyond `tok_eos`. * Points to an array of `tok_eos_extra_count` elements. - * When NULL (the default for zero-initialized structs), only `base.tok_eos` is used. + * When NULL (the default for zero-initialized structs), only `tok_eos` is used. 
*/ const LlgToken *tok_eos_extra; /** diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index ad21e0b9..a42a3b2f 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -142,32 +142,42 @@ impl LlgTokenizer { Self::finish_init(init, trie) } - fn from_init_v2(init_v2: &LlgTokenizerInitV2) -> Result { + fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { let min_size = std::mem::size_of::(); ensure!( - init_v2.struct_size >= min_size, + init.struct_size >= min_size, "LlgTokenizerInitV2.struct_size is {} but expected at least {}. \ Set struct_size = sizeof(LlgTokenizerInitV2).", - init_v2.struct_size, + init.struct_size, min_size ); - let init = &init_v2.base; - // Build the base tokenizer the same way as v1 - let mut tok = Self::from_init(init)?; + // Build a v1 init from the shared fields and delegate + let v1 = LlgTokenizerInit { + vocab_size: init.vocab_size, + tok_eos: init.tok_eos, + token_lens: init.token_lens, + token_bytes: init.token_bytes, + tokenizer_json: init.tokenizer_json, + tokenize_assumes_string: init.tokenize_assumes_string, + tokenize_fn: init.tokenize_fn, + use_approximate_greedy_tokenize_fn: init.use_approximate_greedy_tokenize_fn, + tokenize_user_data: init.tokenize_user_data, + slices: init.slices, + }; + let mut tok = Self::from_init(&v1)?; // Apply additional EOS tokens if provided - if !init_v2.tok_eos_extra.is_null() && init_v2.tok_eos_extra_count > 0 { + if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { let extra = unsafe { std::slice::from_raw_parts( - init_v2.tok_eos_extra, - init_v2.tok_eos_extra_count as usize, + init.tok_eos_extra, + init.tok_eos_extra_count as usize, ) }; let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); - // Rebuild the factory with updated EOS tokens on the trie let trie = tok.factory.tok_env().tok_trie().clone().with_eos_tokens(&eos_tokens); let tok_env: TokEnv = Arc::new(CTokenizerInner { trie, @@ -176,7 +186,7 @@ impl LlgTokenizer { tokenize_fn: 
init.tokenize_fn, tokenize_user_data: init.tokenize_user_data, }); - let slices = Self::read_slices(init)?; + let slices = Self::read_slices_raw(init.slices)?; let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; tok.factory = Arc::new(factory); @@ -185,22 +195,22 @@ impl LlgTokenizer { Ok(tok) } - fn read_slices(init: &LlgTokenizerInit) -> Result> { - if init.slices.is_null() { + fn read_slices_raw(slices: *const *const c_char) -> Result> { + if slices.is_null() { Ok(SlicedBiasComputer::general_slices()) } else { - let mut slices = vec![]; + let mut result = vec![]; let mut idx = 0; loop { - let p = unsafe { *init.slices.add(idx) }; + let p = unsafe { *slices.add(idx) }; if p.is_null() { break; } let s = unsafe { c_str_to_str(p, "slice") }?; - slices.push(s.to_string()); + result.push(s.to_string()); idx += 1; } - Ok(slices) + Ok(result) } } @@ -212,7 +222,7 @@ impl LlgTokenizer { tokenize_user_data: init.tokenize_user_data, }); - let slices = Self::read_slices(init)?; + let slices = Self::read_slices_raw(init.slices)?; let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; @@ -320,12 +330,51 @@ pub struct LlgTokenizerInitV2 { /// This allows the library to detect which fields are available. pub struct_size: usize, - /// All fields from the original LlgTokenizerInit. - pub base: LlgTokenizerInit, + /// The number of tokens in the vocabulary + pub vocab_size: u32, + + /// The token ID for the end of sentence token + /// For chat mode, set it to end-of-turn token + pub tok_eos: LlgToken, + + /// An array of the lengths of the token strings (vocab_size elements) + pub token_lens: *const u32, + + /// A pointer to the token strings + /// The length of this the sum of all token_lens + pub token_bytes: *const u8, + + /// Instead of passing token_lens and token_bytes, this can be set to + /// the contents of HF tokenizer.json file. 
+ pub tokenizer_json: *const c_char, + + /// Set to true to enable hack that works around the tokenize_fn only + /// accepting valid UTF-8 strings and possibly adding etc. + /// TODO: the bit not implemented yet + pub tokenize_assumes_string: bool, + + /// Tokenization function, see LlgTokenizeFn docs. + /// It should only tokenize the bytes and not add + /// any etc. It should also work on any byte sequence, including + /// invalid UTF-8. If this is not the case, set tokenize_assumes_string to true. + /// Either way, this function has to be thread-safe! + pub tokenize_fn: LlgTokenizeFn, + + /// Set to true to not use tokenize_fn and instead tokenize greedily, + /// which is often incorrect and may reduce accuracy. + pub use_approximate_greedy_tokenize_fn: bool, + + /// User data to pass to the tokenize_fn + pub tokenize_user_data: *const c_void, + + /// Tokenizer partitions for the slicer optimization. + /// This is array of pointers to strings, terminated with NULL (argv style). + /// Pass NULL to use defaults. Pass empty array to disable. + pub slices: *const *const c_char, - /// Additional EOS token IDs beyond `base.tok_eos`. + /// Additional EOS token IDs beyond `tok_eos`. /// Points to an array of `tok_eos_extra_count` elements. - /// When NULL (the default for zero-initialized structs), only `base.tok_eos` is used. + /// When NULL (the default for zero-initialized structs), only `tok_eos` is used. pub tok_eos_extra: *const LlgToken, /// Number of elements in the `tok_eos_extra` array. From 6e4315994f3239a3b88a04ffe6e50d7eac723dc6 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 11:06:31 -0800 Subject: [PATCH 08/35] Test both v1 and v2 C ABI in c_sample Add create_tokenizer_v2() and create_byte_tokenizer_v2() that exercise LlgTokenizerInitV2 with struct_size, flat fields, and an extra EOS token. Extract run_constraint_test() helper and run the full constraint test with both v1 and v2 tokenizers. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 117 +++++++++++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 24 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index c5539276..69438d03 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -9,6 +9,51 @@ #include "llguidance.h" +// Create an LlgTokenizer using the v2 API with an extra EOS token. +// tok_eos is the primary; extra_eos_tokens are additional EOS token IDs. +LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, + uint32_t tok_eos, + std::vector extra_eos_tokens, + LlgTokenizeFn tokenize_fn, + const void *tokenize_user_data) { + auto token_lens = new uint32_t[tokens.size()]; + size_t total_size = 0; + for (size_t i = 0; i < tokens.size(); i++) { + token_lens[i] = tokens[i].size(); + total_size += token_lens[i]; + } + auto token_bytes = new uint8_t[total_size]; + size_t offset = 0; + for (size_t i = 0; i < tokens.size(); i++) { + memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]); + offset += token_lens[i]; + } + + LlgTokenizerInitV2 tok_init = {}; + tok_init.struct_size = sizeof(tok_init); + tok_init.vocab_size = (uint32_t)tokens.size(); + tok_init.tok_eos = tok_eos; + tok_init.token_lens = token_lens; + tok_init.token_bytes = token_bytes; + tok_init.tokenize_assumes_string = false; + tok_init.tokenize_user_data = tokenize_user_data; + tok_init.tokenize_fn = tokenize_fn; + if (!extra_eos_tokens.empty()) { + tok_init.tok_eos_extra = extra_eos_tokens.data(); + tok_init.tok_eos_extra_count = (uint32_t)extra_eos_tokens.size(); + } + + char error_buf[128]; + auto tok = llg_new_tokenizer_v2(&tok_init, error_buf, sizeof(error_buf)); + + if (tok == nullptr) { + printf("Error (v2): %s\n", error_buf); + exit(1); + } + + return tok; +} + // Create an LlgTokenizer; tokens[token_id] is a byte sequence corresponding to // given token_id; see below for tokenize_fn LlgTokenizer 
*create_tokenizer(std::vector> &tokens, @@ -95,6 +140,22 @@ LlgTokenizer *create_byte_tokenizer(void) { nullptr); } +// Same as above but using the v2 API with an extra (unused) EOS token. +LlgTokenizer *create_byte_tokenizer_v2(void) { + std::vector> tokens; + for (size_t i = 0; i < 256; i++) { + tokens.push_back({(uint8_t)i}); + } + const char *eos = ""; + tokens.push_back(std::vector(eos, eos + strlen(eos))); + const char *eos2 = ""; + tokens.push_back(std::vector(eos2, eos2 + strlen(eos2))); + // Primary EOS is token 256 (), extra EOS is token 257 () + std::vector extra_eos = {(uint32_t)(tokens.size() - 1)}; + return create_tokenizer_v2(tokens, tokens.size() - 2, extra_eos, + tokenize_callback, nullptr); +} + LlgTokenizer *create_hf_tokenizer(std::string tokenizer_json, uint32_t tok_eos) { LlgTokenizerInit tok_init = {}; @@ -154,21 +215,8 @@ std::string do_llg_stringify_tokens(const LlgTokenizer *tok, } } -int main(int argc, const char *argv[]) { - if (argc < 3) { - printf("Usage: %s [tokenizer.json]\n", - argv[0]); - return 1; - } - - // the tokenizer can (and should) be shared between constraints - LlgTokenizer *tokenizer = argc > 3 - ? 
create_hf_tokenizer(read_file(argv[3]), 2) - : create_byte_tokenizer(); - - auto schema_json = read_file(argv[1]); - auto sample_json = read_file(argv[2]); - +void run_constraint_test(LlgTokenizer *tokenizer, const std::string &schema_json, + const std::string &sample_json, const char *label) { LlgConstraintInit init; llg_constraint_init_set_defaults(&init, tokenizer); init.log_stderr_level = 0; // default to 1 (warnings only) @@ -180,14 +228,6 @@ int main(int argc, const char *argv[]) { fail_constraint(c); } - // for debugging the tokenizer: - // for (int i = 0; i < 320; ++i) { - // std::vector tokens; - // tokens.push_back(i); - // std::string s = do_llg_stringify_tokens(tokenizer, tokens); - // printf("Token %d: %s\n", i, s.c_str()); - // } - // we assume our "LLM" will generate these tokens auto tokens = do_llg_tokenize(tokenizer, sample_json); @@ -225,6 +265,35 @@ int main(int argc, const char *argv[]) { // we assume the constraint will force EOS at the end of the input assert(mask_res.is_stop); - printf("OK!\n"); + llg_free_constraint(c); + printf("%s: OK!\n", label); +} + +int main(int argc, const char *argv[]) { + if (argc < 3) { + printf("Usage: %s [tokenizer.json]\n", + argv[0]); + return 1; + } + + auto schema_json = read_file(argv[1]); + auto sample_json = read_file(argv[2]); + + // Test with v1 API (LlgTokenizerInit + llg_new_tokenizer) + { + LlgTokenizer *tokenizer = argc > 3 + ? 
create_hf_tokenizer(read_file(argv[3]), 2) + : create_byte_tokenizer(); + run_constraint_test(tokenizer, schema_json, sample_json, "v1"); + llg_free_tokenizer(tokenizer); + } + + // Test with v2 API (LlgTokenizerInitV2 + llg_new_tokenizer_v2) + { + LlgTokenizer *tokenizer = create_byte_tokenizer_v2(); + run_constraint_test(tokenizer, schema_json, sample_json, "v2"); + llg_free_tokenizer(tokenizer); + } + return 0; } From bc534b3b5c668948ee9d7a951c09757d0b9e0f14 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 11:18:40 -0800 Subject: [PATCH 09/35] cargo fmt --- parser/src/ffi.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index a42a3b2f..6a439895 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -170,25 +170,25 @@ impl LlgTokenizer { // Apply additional EOS tokens if provided if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { let extra = unsafe { - std::slice::from_raw_parts( - init.tok_eos_extra, - init.tok_eos_extra_count as usize, - ) + std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) }; let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); - let trie = tok.factory.tok_env().tok_trie().clone().with_eos_tokens(&eos_tokens); + let trie = tok + .factory + .tok_env() + .tok_trie() + .clone() + .with_eos_tokens(&eos_tokens); let tok_env: TokEnv = Arc::new(CTokenizerInner { trie, - tokenize_assumes_string: init.tokenize_assumes_string - && init.tokenize_fn.is_some(), + tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), tokenize_fn: init.tokenize_fn, tokenize_user_data: init.tokenize_user_data, }); let slices = Self::read_slices_raw(init.slices)?; - let factory = - ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; + let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; tok.factory = Arc::new(factory); } 
@@ -313,7 +313,6 @@ pub struct LlgTokenizerInit { /// This is array of pointers to strings, terminated with NULL (argv style). /// Pass NULL to use defaults. Pass empty array to disable. pub slices: *const *const c_char, - } /// V2 of the tokenizer initialization struct. From 4387f51a48498751e7ca227dbf49c14eb92d755d Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 11:49:48 -0800 Subject: [PATCH 10/35] Validate EOS token IDs and fix struct_size forward compatibility with_eos_tokens() now asserts all token IDs are within vocab_size, preventing out-of-bounds panics during mask computation. This covers all paths (C API, Python bindings, Rust API). from_init_v2() now accepts smaller struct_size values from callers compiled against older headers. Fields beyond what struct_size covers are treated as zero/default. The minimum accepted size is the base fields through slices (matching v1 + struct_size). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/llguidance.h | 3 ++- parser/src/ffi.rs | 19 +++++++++++++------ toktrie/src/toktree.rs | 7 +++++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/parser/llguidance.h b/parser/llguidance.h index 010d4221..2b578768 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -259,7 +259,8 @@ typedef struct LlgTokenizerInit { typedef struct LlgTokenizerInitV2 { /** * Must be set to `sizeof(LlgTokenizerInitV2)`. - * This allows the library to detect which fields are available. + * The library uses this to determine which fields are present, allowing + * older callers (with a smaller struct) to work with newer library versions. 
*/ size_t struct_size; /** diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 6a439895..84ba405c 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -143,15 +143,21 @@ impl LlgTokenizer { } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { - let min_size = std::mem::size_of::(); + // The minimum struct_size is the base fields through slices (same as + // LlgTokenizerInit + the leading struct_size field). Fields appended + // after slices are only read when struct_size indicates they're present. + let base_size = std::mem::offset_of!(LlgTokenizerInitV2, tok_eos_extra); ensure!( - init.struct_size >= min_size, + init.struct_size >= base_size, "LlgTokenizerInitV2.struct_size is {} but expected at least {}. \ Set struct_size = sizeof(LlgTokenizerInitV2).", init.struct_size, - min_size + base_size ); + let has_eos_extra = + init.struct_size >= std::mem::size_of::(); + // Build a v1 init from the shared fields and delegate let v1 = LlgTokenizerInit { vocab_size: init.vocab_size, @@ -167,8 +173,8 @@ impl LlgTokenizer { }; let mut tok = Self::from_init(&v1)?; - // Apply additional EOS tokens if provided - if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { + // Apply additional EOS tokens if the struct is large enough to contain them + if has_eos_extra && !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { let extra = unsafe { std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) }; @@ -326,7 +332,8 @@ pub struct LlgTokenizerInit { #[repr(C)] pub struct LlgTokenizerInitV2 { /// Must be set to `sizeof(LlgTokenizerInitV2)`. - /// This allows the library to detect which fields are available. + /// The library uses this to determine which fields are present, allowing + /// older callers (with a smaller struct) to work with newer library versions. 
pub struct_size: usize, /// The number of tokens in the vocabulary diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 4f3a3d93..763ad9aa 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -222,6 +222,13 @@ impl TokTrie { pub fn with_eos_tokens(&self, eos_tokens: &[TokenId]) -> Self { assert!(!eos_tokens.is_empty(), "eos_tokens must not be empty"); + let vocab = self.vocab_size() as u32; + for &tok in eos_tokens { + assert!( + tok < vocab, + "EOS token ID {tok} is out of range (vocab_size={vocab})" + ); + } let mut r = self.clone(); r.info.tok_eos = eos_tokens[0]; r.eos_tokens = eos_tokens.to_vec(); From d65de0d379f8cc4268e741ac0a52efb49b784de7 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 11:57:58 -0800 Subject: [PATCH 11/35] Fix struct_size check and add EOS validation in FFI path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert struct_size to strict check (require >= sizeof) since the function takes &LlgTokenizerInitV2 — Rust assumes the full struct is readable, so accepting smaller sizes would be UB. Update docs to note struct_size is reserved for future forward compatibility. Also validate EOS token IDs against vocab_size in from_init_v2(), before calling with_eos_tokens(). This gives C callers a graceful error instead of a panic across FFI. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/llguidance.h | 10 +++++----- parser/src/ffi.rs | 34 ++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/parser/llguidance.h b/parser/llguidance.h index 2b578768..1f922645 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -252,15 +252,15 @@ typedef struct LlgTokenizerInit { * Use with `llg_new_tokenizer_v2()`.
* * Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` - * The struct_size field allows future fields to be appended without breaking - * existing callers — new fields will default to zero when struct_size is smaller - * than the library expects. + * The struct_size field is reserved for forward compatibility: future library + * versions will accept older (smaller) struct sizes and default new fields to zero. + * Currently, struct_size must equal `sizeof(LlgTokenizerInitV2)`. */ typedef struct LlgTokenizerInitV2 { /** * Must be set to `sizeof(LlgTokenizerInitV2)`. - * The library uses this to determine which fields are present, allowing - * older callers (with a smaller struct) to work with newer library versions. + * Reserved for forward compatibility: future library versions will use this + * to detect which fields are present when new fields are appended. */ size_t struct_size; /** diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 84ba405c..cf87513d 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -143,21 +143,15 @@ impl LlgTokenizer { } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { - // The minimum struct_size is the base fields through slices (same as - // LlgTokenizerInit + the leading struct_size field). Fields appended - // after slices are only read when struct_size indicates they're present. - let base_size = std::mem::offset_of!(LlgTokenizerInitV2, tok_eos_extra); + let expected_size = std::mem::size_of::(); ensure!( - init.struct_size >= base_size, + init.struct_size >= expected_size, "LlgTokenizerInitV2.struct_size is {} but expected at least {}. 
\ Set struct_size = sizeof(LlgTokenizerInitV2).", init.struct_size, - base_size + expected_size ); - let has_eos_extra = - init.struct_size >= std::mem::size_of::(); - // Build a v1 init from the shared fields and delegate let v1 = LlgTokenizerInit { vocab_size: init.vocab_size, @@ -173,14 +167,22 @@ impl LlgTokenizer { }; let mut tok = Self::from_init(&v1)?; - // Apply additional EOS tokens if the struct is large enough to contain them - if has_eos_extra && !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { + // Apply additional EOS tokens if provided + if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { let extra = unsafe { std::slice::from_raw_parts(init.tok_eos_extra, init.tok_eos_extra_count as usize) }; let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); + let vocab_size = tok.factory.tok_env().tok_trie().vocab_size() as u32; + for &id in &eos_tokens { + ensure!( + id < vocab_size, + "EOS token ID {id} is out of range (vocab_size={vocab_size})" + ); + } + let trie = tok .factory .tok_env() @@ -326,14 +328,14 @@ pub struct LlgTokenizerInit { /// Use with `llg_new_tokenizer_v2()`. /// /// Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` -/// The struct_size field allows future fields to be appended without breaking -/// existing callers — new fields will default to zero when struct_size is smaller -/// than the library expects. +/// The struct_size field is reserved for forward compatibility: future library +/// versions will accept older (smaller) struct sizes and default new fields to zero. +/// Currently, struct_size must equal `sizeof(LlgTokenizerInitV2)`. #[repr(C)] pub struct LlgTokenizerInitV2 { /// Must be set to `sizeof(LlgTokenizerInitV2)`. - /// The library uses this to determine which fields are present, allowing - /// older callers (with a smaller struct) to work with newer library versions. 
+ /// Reserved for forward compatibility: future library versions will use this + /// to detect which fields are present when new fields are appended. pub struct_size: usize, /// The number of tokens in the vocabulary From 56526daab1942beed2b2d8fe6a28d9bad3fa0eaa Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 12:05:58 -0800 Subject: [PATCH 12/35] Make struct_size forward compatibility real via raw pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change llg_new_tokenizer_v2() to take a raw pointer instead of a Rust reference. The function reads struct_size first, then copies only min(struct_size, sizeof) bytes into a local zeroed struct. This means callers compiled against an older (smaller) header genuinely work with newer library versions — new fields default to zero. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/llguidance.h | 21 +++++++++++---- parser/src/ffi.rs | 66 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/parser/llguidance.h b/parser/llguidance.h index 1f922645..af304ebf 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -252,15 +252,15 @@ typedef struct LlgTokenizerInit { * Use with `llg_new_tokenizer_v2()`. * * Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` - * The struct_size field is reserved for forward compatibility: future library - * versions will accept older (smaller) struct sizes and default new fields to zero. - * Currently, struct_size must equal `sizeof(LlgTokenizerInitV2)`. + * The library only reads `struct_size` bytes from the pointer, so callers + * compiled against an older header (with a smaller struct) will work with + * newer library versions — any new fields default to zero. */ typedef struct LlgTokenizerInitV2 { /** * Must be set to `sizeof(LlgTokenizerInitV2)`. 
- * Reserved for forward compatibility: future library versions will use this - * to detect which fields are present when new fields are appended. + * The library uses this to determine how many bytes to read, enabling + * forward compatibility when new fields are appended in future versions. */ size_t struct_size; /** @@ -436,6 +436,17 @@ struct LlgTokenizer *llg_new_tokenizer(const struct LlgTokenizerInit *tok_init, /** * Create a new tokenizer from a LlgTokenizerInitV2 struct. * This is the v2 API that supports multiple EOS tokens. + * + * The `tok_init` pointer must be valid and `tok_init->struct_size` must be set + * to `sizeof(LlgTokenizerInitV2)` as known by the caller. The library will + * only read `struct_size` bytes, so callers compiled against an older (smaller) + * version of the struct will work with newer library versions — new fields + * default to zero. + * + * `tok_init` must point to at least `tok_init->struct_size` bytes of + * initialized memory, and `struct_size` must be at least + * `offsetof(LlgTokenizerInitV2, tok_eos)` (i.e., include the struct_size + * field itself plus vocab_size). */ struct LlgTokenizer *llg_new_tokenizer_v2(const struct LlgTokenizerInitV2 *tok_init, char *error_string, diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index cf87513d..bdc23d9e 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -143,14 +143,6 @@ impl LlgTokenizer { } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { - let expected_size = std::mem::size_of::(); - ensure!( - init.struct_size >= expected_size, - "LlgTokenizerInitV2.struct_size is {} but expected at least {}. \ - Set struct_size = sizeof(LlgTokenizerInitV2).", - init.struct_size, - expected_size - ); // Build a v1 init from the shared fields and delegate let v1 = LlgTokenizerInit { @@ -328,14 +320,14 @@ pub struct LlgTokenizerInit { /// Use with `llg_new_tokenizer_v2()`. 
/// /// Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);` -/// The struct_size field is reserved for forward compatibility: future library -/// versions will accept older (smaller) struct sizes and default new fields to zero. -/// Currently, struct_size must equal `sizeof(LlgTokenizerInitV2)`. +/// The library only reads `struct_size` bytes from the pointer, so callers +/// compiled against an older header (with a smaller struct) will work with +/// newer library versions — any new fields default to zero. #[repr(C)] pub struct LlgTokenizerInitV2 { /// Must be set to `sizeof(LlgTokenizerInitV2)`. - /// Reserved for forward compatibility: future library versions will use this - /// to detect which fields are present when new fields are appended. + /// The library uses this to determine how many bytes to read, enabling + /// forward compatibility when new fields are appended in future versions. pub struct_size: usize, /// The number of tokens in the vocabulary @@ -809,15 +801,57 @@ pub unsafe extern "C" fn llg_new_tokenizer( /// Create a new tokenizer from a LlgTokenizerInitV2 struct. /// This is the v2 API that supports multiple EOS tokens. +/// +/// The `tok_init` pointer must be valid and `tok_init->struct_size` must be set +/// to `sizeof(LlgTokenizerInitV2)` as known by the caller. The library will +/// only read `struct_size` bytes, so callers compiled against an older (smaller) +/// version of the struct will work with newer library versions — new fields +/// default to zero. +/// /// # Safety -/// This function should only be called from C code. +/// `tok_init` must point to at least `tok_init->struct_size` bytes of +/// initialized memory, and `struct_size` must be at least +/// `offsetof(LlgTokenizerInitV2, tok_eos)` (i.e., include the struct_size +/// field itself plus vocab_size). 
#[no_mangle] pub unsafe extern "C" fn llg_new_tokenizer_v2( - tok_init: &LlgTokenizerInitV2, + tok_init: *const LlgTokenizerInitV2, error_string: *mut c_char, error_string_len: usize, ) -> *mut LlgTokenizer { - match LlgTokenizer::from_init_v2(tok_init) { + if tok_init.is_null() { + save_error_string( + anyhow::anyhow!("tok_init is NULL"), + error_string, + error_string_len, + ); + return std::ptr::null_mut(); + } + + // Read struct_size from the first field (always safe if pointer is valid) + let struct_size = unsafe { std::ptr::read(tok_init as *const usize) }; + let min_size = std::mem::offset_of!(LlgTokenizerInitV2, tok_eos); + if struct_size < min_size { + save_error_string( + anyhow::anyhow!( + "LlgTokenizerInitV2.struct_size is {struct_size} but expected at least {min_size}. \ + Set struct_size = sizeof(LlgTokenizerInitV2)." + ), + error_string, + error_string_len, + ); + return std::ptr::null_mut(); + } + + // Copy the caller's data into a zero-initialized local struct. + // Fields beyond what the caller provides default to zero. 
+ let mut local: LlgTokenizerInitV2 = unsafe { std::mem::zeroed() }; + let copy_size = std::cmp::min(struct_size, std::mem::size_of::()); + unsafe { + std::ptr::copy_nonoverlapping(tok_init as *const u8, &mut local as *mut LlgTokenizerInitV2 as *mut u8, copy_size); + } + + match LlgTokenizer::from_init_v2(&local) { Ok(tok) => Box::into_raw(Box::new(tok)), Err(e) => { save_error_string(e, error_string, error_string_len); From a7292efa08719883490d9caf1d75f8fcea7fe918 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 12:19:14 -0800 Subject: [PATCH 13/35] cargo fmt --- parser/src/ffi.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index bdc23d9e..1076ad82 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -143,7 +143,6 @@ impl LlgTokenizer { } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { - // Build a v1 init from the shared fields and delegate let v1 = LlgTokenizerInit { vocab_size: init.vocab_size, @@ -848,7 +847,11 @@ pub unsafe extern "C" fn llg_new_tokenizer_v2( let mut local: LlgTokenizerInitV2 = unsafe { std::mem::zeroed() }; let copy_size = std::cmp::min(struct_size, std::mem::size_of::()); unsafe { - std::ptr::copy_nonoverlapping(tok_init as *const u8, &mut local as *mut LlgTokenizerInitV2 as *mut u8, copy_size); + std::ptr::copy_nonoverlapping( + tok_init as *const u8, + &mut local as *mut LlgTokenizerInitV2 as *mut u8, + copy_size, + ); } match LlgTokenizer::from_init_v2(&local) { From 988365ae2344e15d0be3bed5b9febfc9cb9437f7 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 12:44:03 -0800 Subject: [PATCH 14/35] Fix multi-EOS in stopped/error fallbacks and TokenizerWrapper path - Fix TokenizerWrapper path in py_new to apply eos_token override - Add TokTrie::eos_token_set() that includes all EOS tokens - Fix LLMatcher::eos_token_set() to use all EOS tokens (was singleton) - Fix LLMatcher::consume_token_inner() to accept any EOS token - Fix 
Matcher::compute_mask_or_eos() to use all EOS tokens - Add Python tests for multi-EOS via TokenizerWrapper mock Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/src/matcher.rs | 3 +- python/torch_tests/test_matcher.py | 52 ++++++++++++++++++++++++++++++ python_ext/src/llmatcher.rs | 11 +++++-- python_ext/src/py.rs | 7 +++- toktrie/src/toktree.rs | 9 ++++++ 5 files changed, 76 insertions(+), 6 deletions(-) diff --git a/parser/src/matcher.rs b/parser/src/matcher.rs index ff393330..6108d17c 100644 --- a/parser/src/matcher.rs +++ b/parser/src/matcher.rs @@ -109,8 +109,7 @@ impl Matcher { pub fn compute_mask_or_eos(&mut self) -> Result { self.with_inner(|inner| { if inner.parser.stop_reason() != StopReason::NotStopped { - let trie = inner.parser.token_env.tok_trie(); - Ok(trie.singleton_token_set(trie.eos_token())) + Ok(inner.parser.token_env.tok_trie().eos_token_set()) } else { inner.parser.compute_mask() } diff --git a/python/torch_tests/test_matcher.py b/python/torch_tests/test_matcher.py index f32a3241..3a77f8f8 100644 --- a/python/torch_tests/test_matcher.py +++ b/python/torch_tests/test_matcher.py @@ -665,3 +665,55 @@ def test_grammar_warnings() -> None: {"not": { "type": "object" }}) + + +def test_multi_eos_tokens_property() -> None: + """Test that eos_tokens returns the expected list for a single-EOS tokenizer.""" + tok = LLTokenizer("byte") + assert tok.eos_tokens == [tok.eos_token] + assert len(tok.eos_tokens) == 1 + + +class _MockTokenizerWrapper: + """Minimal mock that satisfies the TokenizerWrapper interface for testing.""" + + def __init__(self, tokens: list, eos_token_id: int): + self.tokens = tokens + self.eos_token_id = eos_token_id + self.bos_token_id = None + self.special_token_ids = [] + self.is_tokenizer_wrapper = True + + def __call__(self, s: str) -> list: + return [b for b in s.encode("utf-8")] + + +def test_multi_eos_wrapper_override() -> None: + """Test that eos_token override works with TokenizerWrapper path.""" 
+ # Create a minimal byte-level tokenizer with 258 tokens: + # tokens 0-255 are single bytes, 256 is , 257 is + tokens = [bytes([i]) for i in range(256)] + tokens.append(b"") + tokens.append(b"") + wrapper = _MockTokenizerWrapper(tokens, eos_token_id=256) + + # Without override: single EOS + tok1 = LLTokenizer(wrapper) + assert tok1.eos_token == 256 + assert tok1.eos_tokens == [256] + + # With override: multiple EOS + tok2 = LLTokenizer(wrapper, eos_token=[256, 257]) + assert tok2.eos_token == 256 + assert tok2.eos_tokens == [256, 257] + + # Verify both EOS tokens appear in mask when grammar is accepting + m = LLMatcher(tok2, r'start: "ab"') + assert not m.is_error() + m.consume_token(ord("a")) + m.consume_token(ord("b")) + assert not m.is_error() + + mask = m.compute_logit_bias() + assert mask[256] == 200 # primary EOS + assert mask[257] == 200 # extra EOS diff --git a/python_ext/src/llmatcher.rs b/python_ext/src/llmatcher.rs index edc2edde..f4a86d98 100644 --- a/python_ext/src/llmatcher.rs +++ b/python_ext/src/llmatcher.rs @@ -265,8 +265,7 @@ impl LLMatcher { } fn eos_token_set(&self) -> SimpleVob { - let trie = self.tok_env.tok_trie(); - trie.singleton_token_set(trie.eos_token()) + self.tok_env.tok_trie().eos_token_set() } fn compute_mask_or_eos(&mut self) -> SimpleVob { @@ -280,7 +279,13 @@ impl LLMatcher { } fn consume_token_inner(&mut self, sampled_token: TokenId) -> bool { - if self.inner.is_stopped() && sampled_token == self.tok_env.tok_trie().eos_token() { + if self.inner.is_stopped() + && self + .tok_env + .tok_trie() + .eos_tokens() + .contains(&sampled_token) + { true } else { self.inner.consume_token(sampled_token).is_ok() diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index dec13f6d..0d555e19 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -72,7 +72,12 @@ impl LLTokenizer { tok.into_tok_env(n_vocab).map_err(val_error)? } } else { - Arc::new(PyTokenizer::py_new(tokenizer)?) 
+ let mut py_tok = PyTokenizer::py_new(tokenizer)?; + if let Some(ref eos_tokens) = eos_tokens { + py_tok.tok_trie = + Arc::new(py_tok.tok_trie.with_eos_tokens(eos_tokens)); + } + Arc::new(py_tok) }; let factory = ParserFactory::new( &tok_env, diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 763ad9aa..2221e462 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -283,6 +283,15 @@ impl TokTrie { r } + /// Returns a token set containing all EOS tokens. + pub fn eos_token_set(&self) -> SimpleVob { + let mut r = self.alloc_token_set(); + for &eos in self.eos_tokens() { + r.allow_token(eos); + } + r + } + pub fn token_set_dbg(&self, ts: &SimpleVob) -> String { let max_examples = 50; From b0ec8ef216506bf6e4e6ebfc974e00e5fdcbf78d Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 12:51:14 -0800 Subject: [PATCH 15/35] Add Rust tests for multi-EOS stopped-state mask and simplify Python test - Add test_eos_token_set_single and test_eos_token_set_multiple in toktrie - Add test_multi_eos_mask_when_stopped in sample_parser (Matcher level) - Simplify Python mock test to only verify TokenizerWrapper override applies Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/torch_tests/test_matcher.py | 11 ------- sample_parser/tests/test_raw_parser.rs | 45 +++++++++++++++++++++++++- toktrie/src/toktree.rs | 21 ++++++++++++ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/python/torch_tests/test_matcher.py b/python/torch_tests/test_matcher.py index 3a77f8f8..d12a3239 100644 --- a/python/torch_tests/test_matcher.py +++ b/python/torch_tests/test_matcher.py @@ -706,14 +706,3 @@ def test_multi_eos_wrapper_override() -> None: tok2 = LLTokenizer(wrapper, eos_token=[256, 257]) assert tok2.eos_token == 256 assert tok2.eos_tokens == [256, 257] - - # Verify both EOS tokens appear in mask when grammar is accepting - m = LLMatcher(tok2, r'start: "ab"') - assert not m.is_error() - m.consume_token(ord("a")) - 
m.consume_token(ord("b")) - assert not m.is_error() - - mask = m.compute_logit_bias() - assert mask[256] == 200 # primary EOS - assert mask[257] == 200 # extra EOS diff --git a/sample_parser/tests/test_raw_parser.rs b/sample_parser/tests/test_raw_parser.rs index f67dceff..b4a39332 100644 --- a/sample_parser/tests/test_raw_parser.rs +++ b/sample_parser/tests/test_raw_parser.rs @@ -2,10 +2,11 @@ use lazy_static::lazy_static; use llguidance::{ api::TopLevelGrammar, earley::SlicedBiasComputer, - toktrie::{InferenceCapabilities, TokEnv}, + toktrie::{ApproximateTokEnv, InferenceCapabilities, TokEnv, TokenizerEnv}, Matcher, ParserFactory, TokenParser, }; use serde_json::{json, Value}; +use std::sync::Arc; lazy_static! { static ref PARSER_FACTORY_PHI: ParserFactory = { @@ -355,3 +356,45 @@ fn test_try_consume_eos_consistency() { assert!(eos_consumed <= 1); assert_eq!(n_consumed_no_eos + eos_consumed, n_consumed_all); } + +#[test] +fn test_multi_eos_mask_when_stopped() { + // Build a byte-level tokenizer with two EOS tokens + let base = ApproximateTokEnv::single_byte(); + let base_trie = base.tok_trie(); + let primary_eos = base_trie.eos_token(); + // Pick a special token as the second EOS + let extra_eos = primary_eos - 1; + let multi_trie = base_trie.clone().with_eos_tokens(&[primary_eos, extra_eos]); + let tok_env: TokEnv = Arc::new(ApproximateTokEnv::new(multi_trie)); + + let factory = ParserFactory::new( + &tok_env, + InferenceCapabilities::default(), + &SlicedBiasComputer::general_slices(), + ) + .unwrap(); + + let grm = TopLevelGrammar::from_lark(r#"start: "a""#.to_string()); + let mut parser = factory.create_parser(grm).unwrap(); + parser.start_without_prompt(); + let mut matcher = Matcher::new(Ok(parser)); + + // Consume "a" — grammar should accept + let mask = matcher.compute_mask().unwrap(); + assert!(mask.is_allowed(b'a' as u32)); + matcher.consume_token(b'a' as u32).unwrap(); + + // Parser stops after accepting the full input. 
+ // compute_mask_or_eos should include BOTH EOS tokens. + let mask = matcher.compute_mask_or_eos().unwrap(); + assert!( + mask.is_allowed(primary_eos), + "primary EOS should be in stopped mask" + ); + assert!( + mask.is_allowed(extra_eos), + "extra EOS should be in stopped mask" + ); + assert!(matcher.is_stopped()); +} diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 2221e462..00895cd5 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -1276,4 +1276,25 @@ mod tests { fn test_with_eos_tokens_empty_panics() { make_test_trie(0).with_eos_tokens(&[]); } + + #[test] + fn test_eos_token_set_single() { + let trie = make_test_trie(2); + let set = trie.eos_token_set(); + assert!(set.is_allowed(2)); + assert!(!set.is_allowed(0)); + assert!(!set.is_allowed(1)); + assert_eq!(set.num_set(), 1); + } + + #[test] + fn test_eos_token_set_multiple() { + let trie = make_test_trie(0).with_eos_tokens(&[1, 3]); + let set = trie.eos_token_set(); + assert!(set.is_allowed(1)); + assert!(set.is_allowed(3)); + assert!(!set.is_allowed(0)); + assert!(!set.is_allowed(2)); + assert_eq!(set.num_set(), 2); + } } From 821f5933d141b410cc4cfd51673455d2494b56be Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 13:50:42 -0800 Subject: [PATCH 16/35] cargo fmt --- python_ext/src/py.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index 0d555e19..b7a41c6f 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -74,8 +74,7 @@ impl LLTokenizer { } else { let mut py_tok = PyTokenizer::py_new(tokenizer)?; if let Some(ref eos_tokens) = eos_tokens { - py_tok.tok_trie = - Arc::new(py_tok.tok_trie.with_eos_tokens(eos_tokens)); + py_tok.tok_trie = Arc::new(py_tok.tok_trie.with_eos_tokens(eos_tokens)); } Arc::new(py_tok) }; From 229cf296a829cc220800eace05b1b45d0c3a91e0 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:01:08 -0800 Subject: [PATCH 17/35] Refactor 
from_init_v2 to avoid double factory construction Extract build_trie() from from_init() so from_init_v2() can apply with_eos_tokens() to the trie before building the factory once. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/src/ffi.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 1076ad82..47050fb4 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -100,6 +100,11 @@ unsafe fn slice_from_ptr_or_empty<'a, T>(data: *const T, len: usize) -> &'a [T] impl LlgTokenizer { fn from_init(init: &LlgTokenizerInit) -> Result { + let trie = Self::build_trie(init)?; + Self::finish_init(init, trie) + } + + fn build_trie(init: &LlgTokenizerInit) -> Result { ensure!( init.tokenize_fn.is_some() || init.use_approximate_greedy_tokenize_fn, "Either tokenize_fn or use_approximate_greedy_tokenize_fn must be set" @@ -137,9 +142,7 @@ impl LlgTokenizer { token_bytes }; - let trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); - - Self::finish_init(init, trie) + Ok(TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens)) } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { @@ -156,7 +159,9 @@ impl LlgTokenizer { tokenize_user_data: init.tokenize_user_data, slices: init.slices, }; - let mut tok = Self::from_init(&v1)?; + + // Build the trie via the v1 path + let mut trie = Self::build_trie(&v1)?; // Apply additional EOS tokens if provided if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { @@ -166,7 +171,7 @@ impl LlgTokenizer { let mut eos_tokens = vec![init.tok_eos]; eos_tokens.extend_from_slice(extra); - let vocab_size = tok.factory.tok_env().tok_trie().vocab_size() as u32; + let vocab_size = trie.vocab_size() as u32; for &id in &eos_tokens { ensure!( id < vocab_size, @@ -174,24 +179,10 @@ impl LlgTokenizer { ); } - let trie = tok - .factory - .tok_env() - .tok_trie() - .clone() - 
.with_eos_tokens(&eos_tokens); - let tok_env: TokEnv = Arc::new(CTokenizerInner { - trie, - tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), - tokenize_fn: init.tokenize_fn, - tokenize_user_data: init.tokenize_user_data, - }); - let slices = Self::read_slices_raw(init.slices)?; - let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; - tok.factory = Arc::new(factory); + trie = trie.with_eos_tokens(&eos_tokens); } - Ok(tok) + Self::finish_init(&v1, trie) } fn read_slices_raw(slices: *const *const c_char) -> Result> { From 1dc5bd6e57f427534e6caefaf4e3273d2e2977b1 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:07:19 -0800 Subject: [PATCH 18/35] Validate EOS token IDs in Python entry points Add validate_eos_tokens() that raises PyValueError for out-of-range IDs. Called in all Python paths (py_new, from_tiktoken, from_llamacpp) before with_eos_tokens/set_eos_tokens to give clean errors instead of panics. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python_ext/src/llamatokenizer.rs | 8 ++++++++ python_ext/src/py.rs | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/python_ext/src/llamatokenizer.rs b/python_ext/src/llamatokenizer.rs index c5642ffa..7af93931 100644 --- a/python_ext/src/llamatokenizer.rs +++ b/python_ext/src/llamatokenizer.rs @@ -125,6 +125,14 @@ pub fn tokenv_from_llamacpp( ensure!(vocab_ptr != 0, "vocab_ptr must be non-null"); ensure!(tokenize_fptr != 0, "tokenize_fptr must be non-null"); + let vocab_size = tokens.len() as u32; + for &id in eos_tokens { + ensure!( + id < vocab_size, + "EOS token ID {id} is out of range (vocab_size={vocab_size})" + ); + } + let info = TokRxInfo::new(tokens.len() as u32, eos_tokens[0]); let mut trie = TokTrie::from(&info, &tokens); if eos_tokens.len() > 1 { diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index b7a41c6f..cbe9f94f 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -33,6 +33,18 @@ fn extract_eos_tokens(obj: &Bound<'_, PyAny>) -> PyResult> { } } +/// Validate that all EOS token IDs are within vocab range. +fn validate_eos_tokens(eos_tokens: &[u32], vocab_size: u32) -> PyResult<()> { + for &id in eos_tokens { + if id >= vocab_size { + return Err(PyValueError::new_err(format!( + "EOS token ID {id} is out of range (vocab_size={vocab_size})" + ))); + } + } + Ok(()) +} + struct PyTokenizer { tok_trie: Arc, tokenizer_fun: Py, @@ -67,6 +79,7 @@ impl LLTokenizer { ByteTokenizer::from_file(&tokenizer_str).map_err(val_error)? }; if let Some(ref eos_tokens) = eos_tokens { + validate_eos_tokens(eos_tokens, tok.tokrx_info().vocab_size)?; tok.set_eos_tokens(eos_tokens); } tok.into_tok_env(n_vocab).map_err(val_error)? 
@@ -74,6 +87,7 @@ impl LLTokenizer { } else { let mut py_tok = PyTokenizer::py_new(tokenizer)?; if let Some(ref eos_tokens) = eos_tokens { + validate_eos_tokens(eos_tokens, py_tok.tok_trie.vocab_size() as u32)?; py_tok.tok_trie = Arc::new(py_tok.tok_trie.with_eos_tokens(eos_tokens)); } Arc::new(py_tok) @@ -113,6 +127,7 @@ impl LLTokenizer { ) .map_err(val_error)?; if eos_tokens.len() > 1 { + validate_eos_tokens(&eos_tokens, bpe.tokrx_info().vocab_size)?; bpe.set_eos_tokens(&eos_tokens); } let tok_env = bpe.to_env(); From 88caf8bce68ad1097bcec0d11deb964cc8c6a04c Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:09:37 -0800 Subject: [PATCH 19/35] Validate single EOS token in from_tiktoken path too Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python_ext/src/py.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_ext/src/py.rs b/python_ext/src/py.rs index cbe9f94f..337fa8ef 100644 --- a/python_ext/src/py.rs +++ b/python_ext/src/py.rs @@ -126,8 +126,8 @@ impl LLTokenizer { eos_tokens[0], ) .map_err(val_error)?; + validate_eos_tokens(&eos_tokens, bpe.tokrx_info().vocab_size)?; if eos_tokens.len() > 1 { - validate_eos_tokens(&eos_tokens, bpe.tokrx_info().vocab_size)?; bpe.set_eos_tokens(&eos_tokens); } let tok_env = bpe.to_env(); From acbb724b8da7d75c834c437df6783f5f36e777ea Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:11:41 -0800 Subject: [PATCH 20/35] cargo fmt --- parser/src/ffi.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 47050fb4..6c20e98a 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -142,7 +142,10 @@ impl LlgTokenizer { token_bytes }; - Ok(TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens)) + Ok(TokTrie::from( + &TokRxInfo::new(tokens.len() as u32, init.tok_eos), + &tokens, + )) } fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { From 
197b628451d0be2c2e6b7c05040355b8b7a5b18a Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:21:00 -0800 Subject: [PATCH 21/35] Address remaining review comments - Fix min_size check to include full tok_eos field, not just its offset - Update doc comment for llg_new_tokenizer_v2 accordingly - Add vocab_size validation to ByteTokenizer::set_eos_token(s) - Free token_lens/token_bytes allocations in c_sample after use Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 6 ++++++ parser/llguidance.h | 4 ++-- parser/src/ffi.rs | 7 ++++--- toktrie_hf_tokenizers/src/lib.rs | 12 ++++++++++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 69438d03..d0a4c3a8 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -51,6 +51,9 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, exit(1); } + delete[] token_lens; + delete[] token_bytes; + return tok; } @@ -101,6 +104,9 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, exit(1); } + delete[] token_lens; + delete[] token_bytes; + return tok; } diff --git a/parser/llguidance.h b/parser/llguidance.h index af304ebf..16efd15a 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -445,8 +445,8 @@ struct LlgTokenizer *llg_new_tokenizer(const struct LlgTokenizerInit *tok_init, * * `tok_init` must point to at least `tok_init->struct_size` bytes of * initialized memory, and `struct_size` must be at least - * `offsetof(LlgTokenizerInitV2, tok_eos)` (i.e., include the struct_size - * field itself plus vocab_size). + * `offsetof(LlgTokenizerInitV2, token_lens)` (i.e., include struct_size, + * vocab_size, and the complete tok_eos field). 
*/ struct LlgTokenizer *llg_new_tokenizer_v2(const struct LlgTokenizerInitV2 *tok_init, char *error_string, diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 6c20e98a..64cde597 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -804,8 +804,8 @@ pub unsafe extern "C" fn llg_new_tokenizer( /// # Safety /// `tok_init` must point to at least `tok_init->struct_size` bytes of /// initialized memory, and `struct_size` must be at least -/// `offsetof(LlgTokenizerInitV2, tok_eos)` (i.e., include the struct_size -/// field itself plus vocab_size). +/// `offsetof(LlgTokenizerInitV2, token_lens)` (i.e., include struct_size, +/// vocab_size, and the complete tok_eos field). #[no_mangle] pub unsafe extern "C" fn llg_new_tokenizer_v2( tok_init: *const LlgTokenizerInitV2, @@ -823,7 +823,8 @@ pub unsafe extern "C" fn llg_new_tokenizer_v2( // Read struct_size from the first field (always safe if pointer is valid) let struct_size = unsafe { std::ptr::read(tok_init as *const usize) }; - let min_size = std::mem::offset_of!(LlgTokenizerInitV2, tok_eos); + let min_size = + std::mem::offset_of!(LlgTokenizerInitV2, tok_eos) + std::mem::size_of::(); if struct_size < min_size { save_error_string( anyhow::anyhow!( diff --git a/toktrie_hf_tokenizers/src/lib.rs b/toktrie_hf_tokenizers/src/lib.rs index 6f54beab..29d126ad 100644 --- a/toktrie_hf_tokenizers/src/lib.rs +++ b/toktrie_hf_tokenizers/src/lib.rs @@ -232,12 +232,24 @@ impl ByteTokenizer { } pub fn set_eos_token(&mut self, tok_id: u32) { + assert!( + tok_id < self.info.vocab_size, + "EOS token ID {tok_id} is out of range (vocab_size={})", + self.info.vocab_size + ); self.info.tok_eos = tok_id; self.eos_tokens_extra.clear(); } pub fn set_eos_tokens(&mut self, tokens: &[TokenId]) { assert!(!tokens.is_empty(), "eos_tokens must not be empty"); + for &tok in tokens { + assert!( + tok < self.info.vocab_size, + "EOS token ID {tok} is out of range (vocab_size={})", + self.info.vocab_size + ); + } self.info.tok_eos = tokens[0]; 
self.eos_tokens_extra = tokens[1..].to_vec(); } From 96bb5fbe24bb7999b356273b21e625ea51a89d4c Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:34:02 -0800 Subject: [PATCH 22/35] Fix mypy errors in test_matcher.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/torch_tests/test_matcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/torch_tests/test_matcher.py b/python/torch_tests/test_matcher.py index d12a3239..c4e398c1 100644 --- a/python/torch_tests/test_matcher.py +++ b/python/torch_tests/test_matcher.py @@ -677,14 +677,14 @@ def test_multi_eos_tokens_property() -> None: class _MockTokenizerWrapper: """Minimal mock that satisfies the TokenizerWrapper interface for testing.""" - def __init__(self, tokens: list, eos_token_id: int): + def __init__(self, tokens: List[bytes], eos_token_id: int): self.tokens = tokens self.eos_token_id = eos_token_id self.bos_token_id = None - self.special_token_ids = [] + self.special_token_ids: List[int] = [] self.is_tokenizer_wrapper = True - def __call__(self, s: str) -> list: + def __call__(self, s: str) -> List[int]: return [b for b in s.encode("utf-8")] @@ -698,11 +698,11 @@ def test_multi_eos_wrapper_override() -> None: wrapper = _MockTokenizerWrapper(tokens, eos_token_id=256) # Without override: single EOS - tok1 = LLTokenizer(wrapper) + tok1 = LLTokenizer(wrapper) # type: ignore[arg-type] assert tok1.eos_token == 256 assert tok1.eos_tokens == [256] # With override: multiple EOS - tok2 = LLTokenizer(wrapper, eos_token=[256, 257]) + tok2 = LLTokenizer(wrapper, eos_token=[256, 257]) # type: ignore[arg-type] assert tok2.eos_token == 256 assert tok2.eos_tokens == [256, 257] From ec47b5ded27134bd9d10d42c0446444f4ccf4bbc Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 14:44:23 -0800 Subject: [PATCH 23/35] Guard eos_token_set() against INVALID_TOKEN and out-of-range IDs Co-authored-by: Copilot 
<223556219+Copilot@users.noreply.github.com> --- toktrie/src/toktree.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs index 00895cd5..6f04347d 100644 --- a/toktrie/src/toktree.rs +++ b/toktrie/src/toktree.rs @@ -286,8 +286,11 @@ impl TokTrie { /// Returns a token set containing all EOS tokens. pub fn eos_token_set(&self) -> SimpleVob { let mut r = self.alloc_token_set(); + let vocab = self.vocab_size() as u32; for &eos in self.eos_tokens() { - r.allow_token(eos); + if eos != INVALID_TOKEN && eos < vocab { + r.allow_token(eos); + } } r } From 037a7b0483c529314e5f9f86e16b7745517eea33 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Fri, 6 Mar 2026 15:06:11 -0800 Subject: [PATCH 24/35] Use offset_of token_lens for min_size to match doc comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parser/src/ffi.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index 64cde597..d75b3f3c 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -823,8 +823,7 @@ pub unsafe extern "C" fn llg_new_tokenizer_v2( // Read struct_size from the first field (always safe if pointer is valid) let struct_size = unsafe { std::ptr::read(tok_init as *const usize) }; - let min_size = - std::mem::offset_of!(LlgTokenizerInitV2, tok_eos) + std::mem::size_of::(); + let min_size = std::mem::offset_of!(LlgTokenizerInitV2, token_lens); if struct_size < min_size { save_error_string( anyhow::anyhow!( From afa0241c42884fe0029a85935726544afcc3a99b Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Tue, 10 Mar 2026 09:17:39 -0700 Subject: [PATCH 25/35] clean up python tests a little bit --- python/torch_tests/test_matcher.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/python/torch_tests/test_matcher.py b/python/torch_tests/test_matcher.py index b4a4722b..8a8194fe 100644 --- 
a/python/torch_tests/test_matcher.py +++ b/python/torch_tests/test_matcher.py @@ -694,28 +694,29 @@ def test_multi_eos_tokens_property() -> None: assert len(tok.eos_tokens) == 1 -class _MockTokenizerWrapper: - """Minimal mock that satisfies the TokenizerWrapper interface for testing.""" - def __init__(self, tokens: List[bytes], eos_token_id: int): - self.tokens = tokens - self.eos_token_id = eos_token_id - self.bos_token_id = None - self.special_token_ids: List[int] = [] - self.is_tokenizer_wrapper = True +def test_multi_eos_wrapper_override() -> None: + """Test that eos_token override works with TokenizerWrapper path.""" - def __call__(self, s: str) -> List[int]: - return [b for b in s.encode("utf-8")] + class MockTokenizerWrapper: + """Minimal mock that satisfies the TokenizerWrapper interface for testing.""" + def __init__(self, tokens: List[bytes], eos_token_id: int): + self.tokens = tokens + self.eos_token_id = eos_token_id + self.bos_token_id = None + self.special_token_ids: List[int] = [] + self.is_tokenizer_wrapper = True + + def __call__(self, s: str) -> List[int]: + return [b for b in s.encode("utf-8")] -def test_multi_eos_wrapper_override() -> None: - """Test that eos_token override works with TokenizerWrapper path.""" # Create a minimal byte-level tokenizer with 258 tokens: # tokens 0-255 are single bytes, 256 is , 257 is tokens = [bytes([i]) for i in range(256)] tokens.append(b"") tokens.append(b"") - wrapper = _MockTokenizerWrapper(tokens, eos_token_id=256) + wrapper = MockTokenizerWrapper(tokens, eos_token_id=256) # Without override: single EOS tok1 = LLTokenizer(wrapper) # type: ignore[arg-type] From c15f0d434e95e4758f6760b64c17528e0f2d38fc Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 11 Mar 2026 09:30:14 -0700 Subject: [PATCH 26/35] Use std::vector instead of new[]/delete[] in c_sample Replace raw new[]/delete[] allocations for token_lens and token_bytes with std::vector in both create_tokenizer_v2() and create_tokenizer(). 
This is exception-safe and avoids manual memory management. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index d0a4c3a8..8a904b3f 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -16,16 +16,16 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, std::vector extra_eos_tokens, LlgTokenizeFn tokenize_fn, const void *tokenize_user_data) { - auto token_lens = new uint32_t[tokens.size()]; + std::vector token_lens(tokens.size()); size_t total_size = 0; for (size_t i = 0; i < tokens.size(); i++) { token_lens[i] = tokens[i].size(); total_size += token_lens[i]; } - auto token_bytes = new uint8_t[total_size]; + std::vector token_bytes(total_size); size_t offset = 0; for (size_t i = 0; i < tokens.size(); i++) { - memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]); + memcpy(token_bytes.data() + offset, tokens[i].data(), token_lens[i]); offset += token_lens[i]; } @@ -33,8 +33,8 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, tok_init.struct_size = sizeof(tok_init); tok_init.vocab_size = (uint32_t)tokens.size(); tok_init.tok_eos = tok_eos; - tok_init.token_lens = token_lens; - tok_init.token_bytes = token_bytes; + tok_init.token_lens = token_lens.data(); + tok_init.token_bytes = token_bytes.data(); tok_init.tokenize_assumes_string = false; tok_init.tokenize_user_data = tokenize_user_data; tok_init.tokenize_fn = tokenize_fn; @@ -51,9 +51,6 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, exit(1); } - delete[] token_lens; - delete[] token_bytes; - return tok; } @@ -62,23 +59,23 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, LlgTokenizer *create_tokenizer(std::vector> &tokens, uint32_t tok_eos, LlgTokenizeFn tokenize_fn, const void *tokenize_user_data) { - auto token_lens = new uint32_t[tokens.size()]; + std::vector 
token_lens(tokens.size()); size_t total_size = 0; for (size_t i = 0; i < tokens.size(); i++) { token_lens[i] = tokens[i].size(); total_size += token_lens[i]; } - auto token_bytes = new uint8_t[total_size]; + std::vector token_bytes(total_size); size_t offset = 0; for (size_t i = 0; i < tokens.size(); i++) { - memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]); + memcpy(token_bytes.data() + offset, tokens[i].data(), token_lens[i]); offset += token_lens[i]; } LlgTokenizerInit tok_init = {}; tok_init.vocab_size = (uint32_t)tokens.size(); tok_init.tok_eos = tok_eos; - tok_init.token_lens = token_lens; - tok_init.token_bytes = token_bytes; + tok_init.token_lens = token_lens.data(); + tok_init.token_bytes = token_bytes.data(); tok_init.tokenize_assumes_string = false; tok_init.tokenize_user_data = tokenize_user_data; tok_init.tokenize_fn = tokenize_fn; @@ -104,9 +101,6 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, exit(1); } - delete[] token_lens; - delete[] token_bytes; - return tok; } From 3026e272f4ddd81882eaeb61b279555377845675 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 11 Mar 2026 09:53:32 -0700 Subject: [PATCH 27/35] Take a single eos_tokens vector in create_tokenizer_v2 Instead of separate tok_eos + extra_eos_tokens parameters, accept a single std::vector where [0] is the primary EOS and any remaining entries are extra EOS tokens. Cleaner C++ API while still mapping naturally to the underlying C struct fields. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 8a904b3f..19ade8aa 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -9,13 +9,13 @@ #include "llguidance.h" -// Create an LlgTokenizer using the v2 API with an extra EOS token. -// tok_eos is the primary; extra_eos_tokens are additional EOS token IDs. 
+// Create an LlgTokenizer using the v2 API. +// eos_tokens[0] is the primary EOS; any remaining entries are extra EOS token IDs. LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, - uint32_t tok_eos, - std::vector extra_eos_tokens, + std::vector eos_tokens, LlgTokenizeFn tokenize_fn, const void *tokenize_user_data) { + assert(!eos_tokens.empty()); std::vector token_lens(tokens.size()); size_t total_size = 0; for (size_t i = 0; i < tokens.size(); i++) { @@ -32,15 +32,15 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, LlgTokenizerInitV2 tok_init = {}; tok_init.struct_size = sizeof(tok_init); tok_init.vocab_size = (uint32_t)tokens.size(); - tok_init.tok_eos = tok_eos; + tok_init.tok_eos = eos_tokens[0]; tok_init.token_lens = token_lens.data(); tok_init.token_bytes = token_bytes.data(); tok_init.tokenize_assumes_string = false; tok_init.tokenize_user_data = tokenize_user_data; tok_init.tokenize_fn = tokenize_fn; - if (!extra_eos_tokens.empty()) { - tok_init.tok_eos_extra = extra_eos_tokens.data(); - tok_init.tok_eos_extra_count = (uint32_t)extra_eos_tokens.size(); + if (eos_tokens.size() > 1) { + tok_init.tok_eos_extra = eos_tokens.data() + 1; + tok_init.tok_eos_extra_count = (uint32_t)(eos_tokens.size() - 1); } char error_buf[128]; @@ -150,10 +150,10 @@ LlgTokenizer *create_byte_tokenizer_v2(void) { tokens.push_back(std::vector(eos, eos + strlen(eos))); const char *eos2 = ""; tokens.push_back(std::vector(eos2, eos2 + strlen(eos2))); - // Primary EOS is token 256 (), extra EOS is token 257 () - std::vector extra_eos = {(uint32_t)(tokens.size() - 1)}; - return create_tokenizer_v2(tokens, tokens.size() - 2, extra_eos, - tokenize_callback, nullptr); + // EOS tokens: token 256 () is primary, token 257 () is extra + std::vector eos_tokens = {(uint32_t)(tokens.size() - 2), + (uint32_t)(tokens.size() - 1)}; + return create_tokenizer_v2(tokens, eos_tokens, tokenize_callback, nullptr); } LlgTokenizer *create_hf_tokenizer(std::string tokenizer_json, From 
724856cb61bfcd41c3de6530c1316dfb9b271ecc Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 11 Mar 2026 10:08:37 -0700 Subject: [PATCH 28/35] Remove stale commented-out v2 snippet from create_tokenizer The v2 API now has a real working example in create_tokenizer_v2() above, so the inline commented-out snippet is redundant. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 19ade8aa..14b5ae03 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -80,19 +80,6 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, tok_init.tokenize_user_data = tokenize_user_data; tok_init.tokenize_fn = tokenize_fn; - // For models with multiple EOS tokens (e.g., Qwen3), use the v2 API: - // LlgTokenizerInitV2 init_v2 = {}; - // init_v2.struct_size = sizeof(init_v2); - // init_v2.vocab_size = tok_init.vocab_size; - // init_v2.tok_eos = tok_init.tok_eos; - // init_v2.token_lens = tok_init.token_lens; - // init_v2.token_bytes = tok_init.token_bytes; - // init_v2.tokenize_fn = tok_init.tokenize_fn; - // LlgToken extra_eos[] = {second_eos, third_eos}; - // init_v2.tok_eos_extra = extra_eos; - // init_v2.tok_eos_extra_count = sizeof(extra_eos) / sizeof(extra_eos[0]); - // auto tok = llg_new_tokenizer_v2(&init_v2, error_buf, sizeof(error_buf)); - char error_buf[128]; auto tok = llg_new_tokenizer(&tok_init, error_buf, sizeof(error_buf)); From cdfbf2d592176dc1ab7b7775d89c14089d17f092 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Tue, 17 Mar 2026 16:29:43 -0700 Subject: [PATCH 29/35] Pre-allocate token vector capacity in byte tokenizer constructors Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 14b5ae03..578aad1f 100644 --- a/c_sample/c_sample.cpp +++ 
b/c_sample/c_sample.cpp @@ -117,6 +117,7 @@ size_t tokenize_callback(const void *user_data, const uint8_t *bytes, // This creates a tokenizer that treats each byte as a token. LlgTokenizer *create_byte_tokenizer(void) { std::vector> tokens; + tokens.reserve(257); // 256 byte tokens + 1 EOS // every byte is a token for (size_t i = 0; i < 256; i++) { tokens.push_back({(uint8_t)i}); @@ -130,6 +131,7 @@ LlgTokenizer *create_byte_tokenizer(void) { // Same as above but using the v2 API with an extra (unused) EOS token. LlgTokenizer *create_byte_tokenizer_v2(void) { std::vector> tokens; + tokens.reserve(258); // 256 byte tokens + 2 EOS for (size_t i = 0; i < 256; i++) { tokens.push_back({(uint8_t)i}); } From 6ee6b940f3d349ca520ff5a11c7fc5d4b56fb5e7 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Tue, 17 Mar 2026 16:30:56 -0700 Subject: [PATCH 30/35] Use std::copy instead of memcpy for token byte packing Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 578aad1f..7404a3b2 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -25,7 +25,7 @@ LlgTokenizer *create_tokenizer_v2(std::vector> &tokens, std::vector token_bytes(total_size); size_t offset = 0; for (size_t i = 0; i < tokens.size(); i++) { - memcpy(token_bytes.data() + offset, tokens[i].data(), token_lens[i]); + std::copy(tokens[i].begin(), tokens[i].end(), token_bytes.data() + offset); offset += token_lens[i]; } @@ -68,7 +68,7 @@ LlgTokenizer *create_tokenizer(std::vector> &tokens, std::vector token_bytes(total_size); size_t offset = 0; for (size_t i = 0; i < tokens.size(); i++) { - memcpy(token_bytes.data() + offset, tokens[i].data(), token_lens[i]); + std::copy(tokens[i].begin(), tokens[i].end(), token_bytes.data() + offset); offset += token_lens[i]; } LlgTokenizerInit tok_init = {}; From e94107449e1242ebc8bacefc4c2683ca10e9c8d8 Mon Sep 
17 00:00:00 2001 From: Hudson Cooper Date: Tue, 17 Mar 2026 16:32:07 -0700 Subject: [PATCH 31/35] Replace remaining memcpy with std::copy in tokenize_callback Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- c_sample/c_sample.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_sample/c_sample.cpp b/c_sample/c_sample.cpp index 7404a3b2..62b82767 100644 --- a/c_sample/c_sample.cpp +++ b/c_sample/c_sample.cpp @@ -108,8 +108,8 @@ size_t tokenize_callback(const void *user_data, const uint8_t *bytes, (void)user_data; auto tokens = bogus_tokenize(bytes, bytes_len); if (output_tokens_len > 0) { - memcpy(output_tokens, tokens.data(), - std::min(output_tokens_len, tokens.size()) * sizeof(uint32_t)); + auto n = std::min(output_tokens_len, tokens.size()); + std::copy(tokens.begin(), tokens.begin() + n, output_tokens); } return tokens.size(); } From 434d169fca9036a9c26532d5a72ed5e8286c02c7 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 18 Mar 2026 09:23:54 -0700 Subject: [PATCH 32/35] doctest format fixes --- parser/src/ffi.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index d75b3f3c..c99b547d 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -284,13 +284,13 @@ pub struct LlgTokenizerInit { pub tokenizer_json: *const c_char, /// Set to true to enable hack that works around the tokenize_fn only - /// accepting valid UTF-8 strings and possibly adding etc. - /// TODO: the bit not implemented yet + /// accepting valid UTF-8 strings and possibly adding `` etc. + /// TODO: the `` bit not implemented yet pub tokenize_assumes_string: bool, /// Tokenization function, see LlgTokenizeFn docs. /// It should only tokenize the bytes and not add - /// any etc. It should also work on any byte sequence, including + /// any `` etc. It should also work on any byte sequence, including /// invalid UTF-8. 
If this is not the case, set tokenize_assumes_string to true. /// Either way, this function has to be thread-safe! pub tokenize_fn: LlgTokenizeFn, @@ -342,13 +342,13 @@ pub struct LlgTokenizerInitV2 { pub tokenizer_json: *const c_char, /// Set to true to enable hack that works around the tokenize_fn only - /// accepting valid UTF-8 strings and possibly adding etc. - /// TODO: the bit not implemented yet + /// accepting valid UTF-8 strings and possibly adding `` etc. + /// TODO: the `` bit not implemented yet pub tokenize_assumes_string: bool, /// Tokenization function, see LlgTokenizeFn docs. /// It should only tokenize the bytes and not add - /// any etc. It should also work on any byte sequence, including + /// any `` etc. It should also work on any byte sequence, including /// invalid UTF-8. If this is not the case, set tokenize_assumes_string to true. /// Either way, this function has to be thread-safe! pub tokenize_fn: LlgTokenizeFn, From ce51ebe83206672c08c34018aff090913644a9ca Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 18 Mar 2026 09:31:10 -0700 Subject: [PATCH 33/35] cbindgen --- parser/llguidance.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parser/llguidance.h b/parser/llguidance.h index 2b627f57..0d6c494c 100644 --- a/parser/llguidance.h +++ b/parser/llguidance.h @@ -288,14 +288,14 @@ typedef struct LlgTokenizerInitV2 { const char *tokenizer_json; /** * Set to true to enable hack that works around the tokenize_fn only - * accepting valid UTF-8 strings and possibly adding etc. - * TODO: the bit not implemented yet + * accepting valid UTF-8 strings and possibly adding `` etc. + * TODO: the `` bit not implemented yet */ bool tokenize_assumes_string; /** * Tokenization function, see LlgTokenizeFn docs. * It should only tokenize the bytes and not add - * any etc. It should also work on any byte sequence, including + * any `` etc. It should also work on any byte sequence, including * invalid UTF-8. 
If this is not the case, set tokenize_assumes_string to true. * Either way, this function has to be thread-safe! */ From cc619d00da5292df635d542689f618a66b5c20b8 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 18 Mar 2026 09:56:29 -0700 Subject: [PATCH 34/35] simplify from_init/from_init_v2 by delegating in a more sensible direction --- parser/src/ffi.rs | 82 +++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index fb248b08..b30d6917 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -100,11 +100,10 @@ unsafe fn slice_from_ptr_or_empty<'a, T>(data: *const T, len: usize) -> &'a [T] impl LlgTokenizer { fn from_init(init: &LlgTokenizerInit) -> Result { - let trie = Self::build_trie(init)?; - Self::finish_init(init, trie) + Self::from_init_v2(&LlgTokenizerInitV2::from_v1(init)) } - fn build_trie(init: &LlgTokenizerInit) -> Result { + fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { ensure!( init.tokenize_fn.is_some() || init.use_approximate_greedy_tokenize_fn, "Either tokenize_fn or use_approximate_greedy_tokenize_fn must be set" @@ -142,29 +141,10 @@ impl LlgTokenizer { token_bytes }; - Ok(TokTrie::from( + let mut trie = TokTrie::from( &TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens, - )) - } - - fn from_init_v2(init: &LlgTokenizerInitV2) -> Result { - // Build a v1 init from the shared fields and delegate - let v1 = LlgTokenizerInit { - vocab_size: init.vocab_size, - tok_eos: init.tok_eos, - token_lens: init.token_lens, - token_bytes: init.token_bytes, - tokenizer_json: init.tokenizer_json, - tokenize_assumes_string: init.tokenize_assumes_string, - tokenize_fn: init.tokenize_fn, - use_approximate_greedy_tokenize_fn: init.use_approximate_greedy_tokenize_fn, - tokenize_user_data: init.tokenize_user_data, - slices: init.slices, - }; - - // Build the trie via the v1 path - let mut trie = Self::build_trie(&v1)?; + ); // Apply additional EOS 
tokens if provided if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 { @@ -185,37 +165,29 @@ impl LlgTokenizer { trie = trie.with_eos_tokens(&eos_tokens); } - Self::finish_init(&v1, trie) - } + let tok_env: TokEnv = Arc::new(CTokenizerInner { + trie, + tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), + tokenize_fn: init.tokenize_fn, + tokenize_user_data: init.tokenize_user_data, + }); - fn read_slices_raw(slices: *const *const c_char) -> Result> { - if slices.is_null() { - Ok(SlicedBiasComputer::general_slices()) + let slices = if init.slices.is_null() { + SlicedBiasComputer::general_slices() } else { - let mut result = vec![]; + let mut slices = vec![]; let mut idx = 0; loop { - let p = unsafe { *slices.add(idx) }; + let p = unsafe { *init.slices.add(idx) }; if p.is_null() { break; } let s = unsafe { c_str_to_str(p, "slice") }?; - result.push(s.to_string()); + slices.push(s.to_string()); idx += 1; } - Ok(result) - } - } - - fn finish_init(init: &LlgTokenizerInit, trie: TokTrie) -> Result { - let tok_env: TokEnv = Arc::new(CTokenizerInner { - trie, - tokenize_assumes_string: init.tokenize_assumes_string && init.tokenize_fn.is_some(), - tokenize_fn: init.tokenize_fn, - tokenize_user_data: init.tokenize_user_data, - }); - - let slices = Self::read_slices_raw(init.slices)?; + slices + }; let factory = ParserFactory::new(&tok_env, InferenceCapabilities::default(), &slices)?; @@ -374,6 +346,26 @@ pub struct LlgTokenizerInitV2 { pub tok_eos_extra_count: u32, } +impl LlgTokenizerInitV2 { + fn from_v1(v1: &LlgTokenizerInit) -> Self { + LlgTokenizerInitV2 { + struct_size: std::mem::size_of::(), + vocab_size: v1.vocab_size, + tok_eos: v1.tok_eos, + token_lens: v1.token_lens, + token_bytes: v1.token_bytes, + tokenizer_json: v1.tokenizer_json, + tokenize_assumes_string: v1.tokenize_assumes_string, + tokenize_fn: v1.tokenize_fn, + use_approximate_greedy_tokenize_fn: v1.use_approximate_greedy_tokenize_fn, + tokenize_user_data: 
v1.tokenize_user_data, + slices: v1.slices, + tok_eos_extra: std::ptr::null(), + tok_eos_extra_count: 0, + } + } +} + #[derive(Clone)] #[repr(C)] pub struct LlgConstraintInit { From 5cfc0576fee1daa4741dbd53454f8b419e186bb1 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Wed, 18 Mar 2026 09:56:58 -0700 Subject: [PATCH 35/35] cargo fmt --- parser/src/ffi.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs index b30d6917..dc610d34 100644 --- a/parser/src/ffi.rs +++ b/parser/src/ffi.rs @@ -141,10 +141,7 @@ impl LlgTokenizer { token_bytes }; - let mut trie = TokTrie::from( - &TokRxInfo::new(tokens.len() as u32, init.tok_eos), - &tokens, - ); + let mut trie = TokTrie::from(&TokRxInfo::new(tokens.len() as u32, init.tok_eos), &tokens); // Apply additional EOS tokens if provided if !init.tok_eos_extra.is_null() && init.tok_eos_extra_count > 0 {