Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
283c765
Add backwards-compatible support for multiple EOS tokens
hudson-ai Mar 6, 2026
b261ae9
cargo fmt
hudson-ai Mar 6, 2026
985e344
Apply suggestions from code review
hudson-ai Mar 6, 2026
abed676
Add zero-initialization requirement comment to LlgTokenizerInit
hudson-ai Mar 6, 2026
963d764
Introduce LlgTokenizerInitV2 and llg_new_tokenizer_v2 for ABI stability
hudson-ai Mar 6, 2026
ca775a5
Add struct_size field to LlgTokenizerInitV2 for forward compatibility
hudson-ai Mar 6, 2026
db177e9
Flatten LlgTokenizerInitV2 fields instead of embedding LlgTokenizerInit
hudson-ai Mar 6, 2026
6e43159
Test both v1 and v2 C ABI in c_sample
hudson-ai Mar 6, 2026
bc534b3
cargo fmt
hudson-ai Mar 6, 2026
4387f51
Validate EOS token IDs and fix struct_size forward compatibility
hudson-ai Mar 6, 2026
d65de0d
Fix struct_size check and add EOS validation in FFI path
hudson-ai Mar 6, 2026
56526da
Make struct_size forward compatibility real via raw pointer
hudson-ai Mar 6, 2026
a7292ef
cargo fmt
hudson-ai Mar 6, 2026
988365a
Fix multi-EOS in stopped/error fallbacks and TokenizerWrapper path
hudson-ai Mar 6, 2026
b0ec8ef
Add Rust tests for multi-EOS stopped-state mask and simplify Python test
hudson-ai Mar 6, 2026
821f593
cargo fmt
hudson-ai Mar 6, 2026
229cf29
Refactor from_init_v2 to avoid double factory construction
hudson-ai Mar 6, 2026
1dc5bd6
Validate EOS token IDs in Python entry points
hudson-ai Mar 6, 2026
88caf8b
Validate single EOS token in from_tiktoken path too
hudson-ai Mar 6, 2026
acbb724
cargo fmt
hudson-ai Mar 6, 2026
197b628
Address remaining review comments
hudson-ai Mar 6, 2026
96bb5fb
Fix mypy errors in test_matcher.py
hudson-ai Mar 6, 2026
ec47b5d
Guard eos_token_set() against INVALID_TOKEN and out-of-range IDs
hudson-ai Mar 6, 2026
037a7b0
Use offset_of token_lens for min_size to match doc comment
hudson-ai Mar 6, 2026
911534d
Merge branch 'main' into multi_eos
hudson-ai Mar 10, 2026
afa0241
clean up python tests a little bit
hudson-ai Mar 10, 2026
c15f0d4
Use std::vector instead of new[]/delete[] in c_sample
hudson-ai Mar 11, 2026
3026e27
Take a single eos_tokens vector in create_tokenizer_v2
hudson-ai Mar 11, 2026
724856c
Remove stale commented-out v2 snippet from create_tokenizer
hudson-ai Mar 11, 2026
cdfbf2d
Pre-allocate token vector capacity in byte tokenizer constructors
hudson-ai Mar 17, 2026
6ee6b94
Use std::copy instead of memcpy for token byte packing
hudson-ai Mar 17, 2026
e941074
Replace remaining memcpy with std::copy in tokenize_callback
hudson-ai Mar 17, 2026
434d169
doctest format fixes
hudson-ai Mar 18, 2026
91e022d
Merge branch 'main' into multi_eos
hudson-ai Mar 18, 2026
ce51ebe
cbindgen
hudson-ai Mar 18, 2026
cc619d0
simplify from_init/from_init_v2 by delegating in a more sensible dire…
hudson-ai Mar 18, 2026
5cfc057
cargo fmt
hudson-ai Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 102 additions & 31 deletions c_sample/c_sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,73 @@

#include "llguidance.h"

// Create an LlgTokenizer using the v2 API (LlgTokenizerInitV2 +
// llg_new_tokenizer_v2).
//
// tokens[token_id] is the byte sequence corresponding to token_id.
// eos_tokens[0] is the primary EOS; any remaining entries are extra EOS
// token IDs. eos_tokens must not be empty.
// tokenize_fn and tokenize_user_data are handed to the library unchanged.
//
// Returns the new tokenizer. On failure (empty eos_tokens, or the library
// returning NULL) a message is printed and the process exits with status 1.
LlgTokenizer *create_tokenizer_v2(std::vector<std::vector<uint8_t>> &tokens,
                                  std::vector<uint32_t> eos_tokens,
                                  LlgTokenizeFn tokenize_fn,
                                  const void *tokenize_user_data) {
    // Checked at runtime rather than with assert(): assert is compiled out
    // under NDEBUG, and reading eos_tokens[0] from an empty vector would be
    // undefined behavior.
    if (eos_tokens.empty()) {
        printf("Error (v2): eos_tokens must contain at least one token ID\n");
        exit(1);
    }

    // Flatten the vocabulary into the two arrays the C API takes: one length
    // per token, plus all token bytes packed back to back in token ID order.
    std::vector<uint32_t> token_lens;
    token_lens.reserve(tokens.size());
    size_t total_size = 0;
    for (const auto &token_data : tokens) {
        token_lens.push_back(static_cast<uint32_t>(token_data.size()));
        total_size += token_data.size();
    }
    std::vector<uint8_t> token_bytes;
    token_bytes.reserve(total_size);
    for (const auto &token_data : tokens) {
        token_bytes.insert(token_bytes.end(), token_data.begin(),
                           token_data.end());
    }

    // Zero-initialize first so every field not set below keeps its default,
    // then record the struct size this code was compiled against.
    LlgTokenizerInitV2 tok_init = {};
    tok_init.struct_size = sizeof(tok_init);
    tok_init.vocab_size = static_cast<uint32_t>(tokens.size());
    tok_init.tok_eos = eos_tokens[0];
    tok_init.token_lens = token_lens.data();
    tok_init.token_bytes = token_bytes.data();
    tok_init.tokenize_assumes_string = false;
    tok_init.tokenize_user_data = tokenize_user_data;
    tok_init.tokenize_fn = tokenize_fn;
    if (eos_tokens.size() > 1) {
        // Everything after the primary EOS is passed as the "extra" list.
        tok_init.tok_eos_extra = eos_tokens.data() + 1;
        tok_init.tok_eos_extra_count =
            static_cast<uint32_t>(eos_tokens.size() - 1);
    }

    // Zero-filled so the buffer is a valid (empty) C string even if nothing
    // gets written into it.
    char error_buf[128] = {};
    auto tok = llg_new_tokenizer_v2(&tok_init, error_buf, sizeof(error_buf));

    if (tok == nullptr) {
        printf("Error (v2): %s\n", error_buf);
        exit(1);
    }

    return tok;
}

// Create an LlgTokenizer; tokens[token_id] is a byte sequence corresponding to
// given token_id; see below for tokenize_fn
LlgTokenizer *create_tokenizer(std::vector<std::vector<uint8_t>> &tokens,
uint32_t tok_eos, LlgTokenizeFn tokenize_fn,
const void *tokenize_user_data) {
auto token_lens = new uint32_t[tokens.size()];
std::vector<uint32_t> token_lens(tokens.size());
size_t total_size = 0;
for (size_t i = 0; i < tokens.size(); i++) {
token_lens[i] = tokens[i].size();
total_size += token_lens[i];
}
auto token_bytes = new uint8_t[total_size];
std::vector<uint8_t> token_bytes(total_size);
size_t offset = 0;
for (size_t i = 0; i < tokens.size(); i++) {
memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]);
std::copy(tokens[i].begin(), tokens[i].end(), token_bytes.data() + offset);
offset += token_lens[i];
}
LlgTokenizerInit tok_init = {};
tok_init.vocab_size = (uint32_t)tokens.size();
tok_init.tok_eos = tok_eos;
tok_init.token_lens = token_lens;
tok_init.token_bytes = token_bytes;
tok_init.token_lens = token_lens.data();
tok_init.token_bytes = token_bytes.data();
tok_init.tokenize_assumes_string = false;
tok_init.tokenize_user_data = tokenize_user_data;
tok_init.tokenize_fn = tokenize_fn;
Expand Down Expand Up @@ -63,15 +108,16 @@ size_t tokenize_callback(const void *user_data, const uint8_t *bytes,
(void)user_data;
auto tokens = bogus_tokenize(bytes, bytes_len);
if (output_tokens_len > 0) {
memcpy(output_tokens, tokens.data(),
std::min(output_tokens_len, tokens.size()) * sizeof(uint32_t));
auto n = std::min(output_tokens_len, tokens.size());
std::copy(tokens.begin(), tokens.begin() + n, output_tokens);
}
return tokens.size();
}

// This creates a tokenizer that treats each byte as a token.
LlgTokenizer *create_byte_tokenizer(void) {
std::vector<std::vector<uint8_t>> tokens;
tokens.reserve(257); // 256 byte tokens + 1 EOS
// every byte is a token
for (size_t i = 0; i < 256; i++) {
tokens.push_back({(uint8_t)i});
Expand All @@ -82,6 +128,23 @@ LlgTokenizer *create_byte_tokenizer(void) {
nullptr);
}

// Same as above but using the v2 API with an extra (unused) EOS token.
Comment thread
hudson-ai marked this conversation as resolved.
LlgTokenizer *create_byte_tokenizer_v2(void) {
std::vector<std::vector<uint8_t>> tokens;
Copy link
Copy Markdown
Contributor

@riedgar-ms riedgar-ms Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small point, but you can also pre-allocate a capacity in the constructor if you have a pretty good idea of the size you'll need (note that size <= capacity)

tokens.reserve(258); // 256 byte tokens + 2 EOS
for (size_t i = 0; i < 256; i++) {
tokens.push_back({(uint8_t)i});
}
const char *eos = "<EOS>";
tokens.push_back(std::vector<uint8_t>(eos, eos + strlen(eos)));
const char *eos2 = "<EOS2>";
tokens.push_back(std::vector<uint8_t>(eos2, eos2 + strlen(eos2)));
// EOS tokens: token 256 (<EOS>) is primary, token 257 (<EOS2>) is extra
std::vector<uint32_t> eos_tokens = {(uint32_t)(tokens.size() - 2),
(uint32_t)(tokens.size() - 1)};
return create_tokenizer_v2(tokens, eos_tokens, tokenize_callback, nullptr);
}

LlgTokenizer *create_hf_tokenizer(std::string tokenizer_json,
uint32_t tok_eos) {
LlgTokenizerInit tok_init = {};
Expand Down Expand Up @@ -141,21 +204,8 @@ std::string do_llg_stringify_tokens(const LlgTokenizer *tok,
}
}

int main(int argc, const char *argv[]) {
if (argc < 3) {
printf("Usage: %s <schema.ll.json> <sample.json> [tokenizer.json]\n",
argv[0]);
return 1;
}

// the tokenizer can (and should) be shared between constraints
LlgTokenizer *tokenizer = argc > 3
? create_hf_tokenizer(read_file(argv[3]), 2)
: create_byte_tokenizer();

auto schema_json = read_file(argv[1]);
auto sample_json = read_file(argv[2]);

void run_constraint_test(LlgTokenizer *tokenizer, const std::string &schema_json,
const std::string &sample_json, const char *label) {
LlgConstraintInit init;
llg_constraint_init_set_defaults(&init, tokenizer);
init.log_stderr_level = 0; // default to 1 (warnings only)
Expand All @@ -167,14 +217,6 @@ int main(int argc, const char *argv[]) {
fail_constraint(c);
}

// for debugging the tokenizer:
// for (int i = 0; i < 320; ++i) {
// std::vector<uint32_t> tokens;
// tokens.push_back(i);
// std::string s = do_llg_stringify_tokens(tokenizer, tokens);
// printf("Token %d: %s\n", i, s.c_str());
// }

// we assume our "LLM" will generate these tokens
auto tokens = do_llg_tokenize(tokenizer, sample_json);

Expand Down Expand Up @@ -212,6 +254,35 @@ int main(int argc, const char *argv[]) {
// we assume the constraint will force EOS at the end of the input
assert(mask_res.is_stop);

printf("OK!\n");
llg_free_constraint(c);
printf("%s: OK!\n", label);
}

int main(int argc, const char *argv[]) {
if (argc < 3) {
printf("Usage: %s <schema.ll.json> <sample.json> [tokenizer.json]\n",
argv[0]);
return 1;
}

auto schema_json = read_file(argv[1]);
auto sample_json = read_file(argv[2]);

// Test with v1 API (LlgTokenizerInit + llg_new_tokenizer)
{
LlgTokenizer *tokenizer = argc > 3
? create_hf_tokenizer(read_file(argv[3]), 2)
: create_byte_tokenizer();
run_constraint_test(tokenizer, schema_json, sample_json, "v1");
llg_free_tokenizer(tokenizer);
}

// Test with v2 API (LlgTokenizerInitV2 + llg_new_tokenizer_v2)
{
LlgTokenizer *tokenizer = create_byte_tokenizer_v2();
run_constraint_test(tokenizer, schema_json, sample_json, "v2");
llg_free_tokenizer(tokenizer);
}

return 0;
}
105 changes: 105 additions & 0 deletions parser/llguidance.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ typedef size_t (*LlgTokenizeFn)(const void *user_data,
uint32_t *output_tokens,
size_t output_tokens_len);

/**
* This struct must be zero-initialized (e.g., `= {}` in C/C++) before setting fields.
* New fields may be appended in future versions, and zero-initialization ensures
* they receive safe default values.
*/
typedef struct LlgTokenizerInit {
/**
* The number of tokens in the vocabulary
Expand Down Expand Up @@ -241,6 +246,87 @@ typedef struct LlgTokenizerInit {
const char *const *slices;
} LlgTokenizerInit;

/**
* V2 of the tokenizer initialization struct.
* Extends LlgTokenizerInit with support for multiple EOS tokens.
* Use with `llg_new_tokenizer_v2()`.
*
* Initialize with: `LlgTokenizerInitV2 init = {}; init.struct_size = sizeof(init);`
* The library only reads `struct_size` bytes from the pointer, so callers
* compiled against an older header (with a smaller struct) will work with
* newer library versions — any new fields default to zero.
*/
typedef struct LlgTokenizerInitV2 {
/**
* Must be set to `sizeof(LlgTokenizerInitV2)`.
* The library uses this to determine how many bytes to read, enabling
* forward compatibility when new fields are appended in future versions.
*/
size_t struct_size;
/**
* The number of tokens in the vocabulary
*/
uint32_t vocab_size;
/**
* The token ID for the end of sentence token
* For chat mode, set it to end-of-turn token
*/
LlgToken tok_eos;
/**
* An array of the lengths of the token strings (vocab_size elements)
*/
const uint32_t *token_lens;
/**
* A pointer to the token strings
* The length of this is the sum of all token_lens
* (the C sample packs the token strings back to back, in token ID order)
*/
const uint8_t *token_bytes;
/**
* Instead of passing token_lens and token_bytes, this can be set to
* the contents of HF tokenizer.json file.
*/
const char *tokenizer_json;
/**
* Set to true to enable hack that works around the tokenize_fn only
* accepting valid UTF-8 strings and possibly adding `<BOS>` etc.
* TODO: the `<BOS>` bit not implemented yet
*/
bool tokenize_assumes_string;
/**
* Tokenization function, see LlgTokenizeFn docs.
* It should only tokenize the bytes and not add
* any `<BOS>` etc. It should also work on any byte sequence, including
* invalid UTF-8. If this is not the case, set tokenize_assumes_string to true.
* Either way, this function has to be thread-safe!
*/
LlgTokenizeFn tokenize_fn;
/**
* Set to true to not use tokenize_fn and instead tokenize greedily,
* which is often incorrect and may reduce accuracy.
*/
bool use_approximate_greedy_tokenize_fn;
/**
* User data to pass to the tokenize_fn
*/
const void *tokenize_user_data;
/**
* Tokenizer partitions for the slicer optimization.
* This is an array of pointers to strings, terminated with NULL (argv style).
* Pass NULL to use defaults. Pass empty array to disable.
*/
const char *const *slices;
/**
* Additional EOS token IDs beyond `tok_eos`.
* Points to an array of `tok_eos_extra_count` elements.
* When NULL (the default for zero-initialized structs), only `tok_eos` is used.
*/
const LlgToken *tok_eos_extra;
/**
* Number of elements in the `tok_eos_extra` array.
*/
uint32_t tok_eos_extra_count;
Comment thread
hudson-ai marked this conversation as resolved.
} LlgTokenizerInitV2;



#ifdef __cplusplus
Expand Down Expand Up @@ -347,6 +433,25 @@ struct LlgTokenizer *llg_new_tokenizer(const struct LlgTokenizerInit *tok_init,
char *error_string,
size_t error_string_len);

/**
* Create a new tokenizer from a LlgTokenizerInitV2 struct.
* This is the v2 API that supports multiple EOS tokens.
*
* The `tok_init` pointer must be valid and `tok_init->struct_size` must be set
* to `sizeof(LlgTokenizerInitV2)` as known by the caller. The library will
* only read `struct_size` bytes, so callers compiled against an older (smaller)
* version of the struct will work with newer library versions — new fields
* default to zero.
*
* `tok_init` must point to at least `tok_init->struct_size` bytes of
* initialized memory, and `struct_size` must be at least
* `offsetof(LlgTokenizerInitV2, token_lens)` (i.e., include struct_size,
* vocab_size, and the complete tok_eos field).
*
* NOTE(review): the C sample treats a NULL return as failure and then prints
* `error_string` as a C string; it releases a successfully created tokenizer
* with `llg_free_tokenizer()`. Confirm the exact error-reporting contract
* (e.g. truncation to `error_string_len`) against the implementation.
*/
struct LlgTokenizer *llg_new_tokenizer_v2(const struct LlgTokenizerInitV2 *tok_init,
char *error_string,
size_t error_string_len);

/**
* Clone a tokenizer.
* This increments a reference count and does a small allocation.
Expand Down
Loading