-
Notifications
You must be signed in to change notification settings - Fork 67
Add backwards-compatible support for multiple EOS tokens #305
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
37 commits
Select commit
Hold shift + click to select a range
283c765
Add backwards-compatible support for multiple EOS tokens
hudson-ai b261ae9
cargo fmt
hudson-ai 985e344
Apply suggestions from code review
hudson-ai abed676
Add zero-initialization requirement comment to LlgTokenizerInit
hudson-ai 963d764
Introduce LlgTokenizerInitV2 and llg_new_tokenizer_v2 for ABI stability
hudson-ai ca775a5
Add struct_size field to LlgTokenizerInitV2 for forward compatibility
hudson-ai db177e9
Flatten LlgTokenizerInitV2 fields instead of embedding LlgTokenizerInit
hudson-ai 6e43159
Test both v1 and v2 C ABI in c_sample
hudson-ai bc534b3
cargo fmt
hudson-ai 4387f51
Validate EOS token IDs and fix struct_size forward compatibility
hudson-ai d65de0d
Fix struct_size check and add EOS validation in FFI path
hudson-ai 56526da
Make struct_size forward compatibility real via raw pointer
hudson-ai a7292ef
cargo fmt
hudson-ai 988365a
Fix multi-EOS in stopped/error fallbacks and TokenizerWrapper path
hudson-ai b0ec8ef
Add Rust tests for multi-EOS stopped-state mask and simplify Python test
hudson-ai 821f593
cargo fmt
hudson-ai 229cf29
Refactor from_init_v2 to avoid double factory construction
hudson-ai 1dc5bd6
Validate EOS token IDs in Python entry points
hudson-ai 88caf8b
Validate single EOS token in from_tiktoken path too
hudson-ai acbb724
cargo fmt
hudson-ai 197b628
Address remaining review comments
hudson-ai 96bb5fb
Fix mypy errors in test_matcher.py
hudson-ai ec47b5d
Guard eos_token_set() against INVALID_TOKEN and out-of-range IDs
hudson-ai 037a7b0
Use offset_of token_lens for min_size to match doc comment
hudson-ai 911534d
Merge branch 'main' into multi_eos
hudson-ai afa0241
clean up python tests a little bit
hudson-ai c15f0d4
Use std::vector instead of new[]/delete[] in c_sample
hudson-ai 3026e27
Take a single eos_tokens vector in create_tokenizer_v2
hudson-ai 724856c
Remove stale commented-out v2 snippet from create_tokenizer
hudson-ai cdfbf2d
Pre-allocate token vector capacity in byte tokenizer constructors
hudson-ai 6ee6b94
Use std::copy instead of memcpy for token byte packing
hudson-ai e941074
Replace remaining memcpy with std::copy in tokenize_callback
hudson-ai 434d169
doctest format fixes
hudson-ai 91e022d
Merge branch 'main' into multi_eos
hudson-ai ce51ebe
cbindgen
hudson-ai cc619d0
simplify from_init/from_init_v2 by delegating in a more sensible dire…
hudson-ai 5cfc057
cargo fmt
hudson-ai File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,28 +9,73 @@ | |
|
|
||
| #include "llguidance.h" | ||
|
|
||
| // Create an LlgTokenizer using the v2 API (LlgTokenizerInitV2 + llg_new_tokenizer_v2). tokens[token_id] is the byte sequence for token_id; the sequences are flattened into a per-token length array plus one contiguous byte buffer before being handed to the library. tokenize_fn and tokenize_user_data are passed through unchanged. | ||
| // eos_tokens[0] is the primary EOS; any remaining entries are extra EOS token IDs. eos_tokens must be non-empty (asserted). If the library returns nullptr, the error text is printed and the process exits with status 1. NOTE(review): the flattened buffers and eos_tokens are locals destroyed on return, so llg_new_tokenizer_v2 presumably copies what it needs - confirm against llguidance.h. | ||
| LlgTokenizer *create_tokenizer_v2(std::vector<std::vector<uint8_t>> &tokens, | ||
| std::vector<uint32_t> eos_tokens, | ||
| LlgTokenizeFn tokenize_fn, | ||
| const void *tokenize_user_data) { | ||
| assert(!eos_tokens.empty()); | ||
| std::vector<uint32_t> token_lens(tokens.size()); | ||
| size_t total_size = 0; | ||
| for (size_t i = 0; i < tokens.size(); i++) { | ||
| token_lens[i] = tokens[i].size(); | ||
| total_size += token_lens[i]; | ||
| } | ||
| std::vector<uint8_t> token_bytes(total_size); | ||
| size_t offset = 0; | ||
| for (size_t i = 0; i < tokens.size(); i++) { | ||
| std::copy(tokens[i].begin(), tokens[i].end(), token_bytes.data() + offset); | ||
| offset += token_lens[i]; | ||
| } | ||
|
|
||
| LlgTokenizerInitV2 tok_init = {}; | ||
| tok_init.struct_size = sizeof(tok_init); | ||
| tok_init.vocab_size = (uint32_t)tokens.size(); | ||
| tok_init.tok_eos = eos_tokens[0]; | ||
| tok_init.token_lens = token_lens.data(); | ||
| tok_init.token_bytes = token_bytes.data(); | ||
| tok_init.tokenize_assumes_string = false; | ||
| tok_init.tokenize_user_data = tokenize_user_data; | ||
| tok_init.tokenize_fn = tokenize_fn; | ||
| if (eos_tokens.size() > 1) { | ||
| tok_init.tok_eos_extra = eos_tokens.data() + 1; | ||
| tok_init.tok_eos_extra_count = (uint32_t)(eos_tokens.size() - 1); | ||
| } | ||
|
|
||
| char error_buf[128]; | ||
| auto tok = llg_new_tokenizer_v2(&tok_init, error_buf, sizeof(error_buf)); | ||
|
|
||
| if (tok == nullptr) { | ||
| printf("Error (v2): %s\n", error_buf); | ||
| exit(1); | ||
| } | ||
|
|
||
| return tok; | ||
| } | ||
|
|
||
| // Create an LlgTokenizer; tokens[token_id] is a byte sequence corresponding to | ||
| // given token_id; see below for tokenize_fn | ||
| LlgTokenizer *create_tokenizer(std::vector<std::vector<uint8_t>> &tokens, | ||
| uint32_t tok_eos, LlgTokenizeFn tokenize_fn, | ||
| const void *tokenize_user_data) { | ||
| auto token_lens = new uint32_t[tokens.size()]; | ||
| std::vector<uint32_t> token_lens(tokens.size()); | ||
| size_t total_size = 0; | ||
| for (size_t i = 0; i < tokens.size(); i++) { | ||
| token_lens[i] = tokens[i].size(); | ||
| total_size += token_lens[i]; | ||
| } | ||
| auto token_bytes = new uint8_t[total_size]; | ||
| std::vector<uint8_t> token_bytes(total_size); | ||
| size_t offset = 0; | ||
| for (size_t i = 0; i < tokens.size(); i++) { | ||
| memcpy(token_bytes + offset, tokens[i].data(), token_lens[i]); | ||
| std::copy(tokens[i].begin(), tokens[i].end(), token_bytes.data() + offset); | ||
| offset += token_lens[i]; | ||
| } | ||
| LlgTokenizerInit tok_init = {}; | ||
| tok_init.vocab_size = (uint32_t)tokens.size(); | ||
| tok_init.tok_eos = tok_eos; | ||
| tok_init.token_lens = token_lens; | ||
| tok_init.token_bytes = token_bytes; | ||
| tok_init.token_lens = token_lens.data(); | ||
| tok_init.token_bytes = token_bytes.data(); | ||
| tok_init.tokenize_assumes_string = false; | ||
| tok_init.tokenize_user_data = tokenize_user_data; | ||
| tok_init.tokenize_fn = tokenize_fn; | ||
|
|
@@ -63,15 +108,16 @@ size_t tokenize_callback(const void *user_data, const uint8_t *bytes, | |
| (void)user_data; | ||
| auto tokens = bogus_tokenize(bytes, bytes_len); | ||
| if (output_tokens_len > 0) { | ||
| memcpy(output_tokens, tokens.data(), | ||
| std::min(output_tokens_len, tokens.size()) * sizeof(uint32_t)); | ||
| auto n = std::min(output_tokens_len, tokens.size()); | ||
| std::copy(tokens.begin(), tokens.begin() + n, output_tokens); | ||
| } | ||
| return tokens.size(); | ||
| } | ||
|
|
||
| // This creates a tokenizer that treats each byte as a token. | ||
| LlgTokenizer *create_byte_tokenizer(void) { | ||
| std::vector<std::vector<uint8_t>> tokens; | ||
| tokens.reserve(257); // 256 byte tokens + 1 EOS | ||
| // every byte is a token | ||
| for (size_t i = 0; i < 256; i++) { | ||
| tokens.push_back({(uint8_t)i}); | ||
|
|
@@ -82,6 +128,23 @@ LlgTokenizer *create_byte_tokenizer(void) { | |
| nullptr); | ||
| } | ||
|
|
||
| // Same as above but using the v2 API with an extra (unused) EOS token. | ||
| LlgTokenizer *create_byte_tokenizer_v2(void) { | ||
| std::vector<std::vector<uint8_t>> tokens; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Small point, but you can also pre-allocate a capacity in the constructor if you have a pretty good idea of the size you'll need (note that size <= capacity) |
||
| tokens.reserve(258); // 256 byte tokens + 2 EOS | ||
| for (size_t i = 0; i < 256; i++) { | ||
| tokens.push_back({(uint8_t)i}); | ||
| } | ||
| const char *eos = "<EOS>"; | ||
| tokens.push_back(std::vector<uint8_t>(eos, eos + strlen(eos))); | ||
| const char *eos2 = "<EOS2>"; | ||
| tokens.push_back(std::vector<uint8_t>(eos2, eos2 + strlen(eos2))); | ||
| // EOS tokens: token 256 (<EOS>) is primary, token 257 (<EOS2>) is extra | ||
| std::vector<uint32_t> eos_tokens = {(uint32_t)(tokens.size() - 2), | ||
| (uint32_t)(tokens.size() - 1)}; | ||
| return create_tokenizer_v2(tokens, eos_tokens, tokenize_callback, nullptr); | ||
| } | ||
|
|
||
| LlgTokenizer *create_hf_tokenizer(std::string tokenizer_json, | ||
| uint32_t tok_eos) { | ||
| LlgTokenizerInit tok_init = {}; | ||
|
|
@@ -141,21 +204,8 @@ std::string do_llg_stringify_tokens(const LlgTokenizer *tok, | |
| } | ||
| } | ||
|
|
||
| int main(int argc, const char *argv[]) { | ||
| if (argc < 3) { | ||
| printf("Usage: %s <schema.ll.json> <sample.json> [tokenizer.json]\n", | ||
| argv[0]); | ||
| return 1; | ||
| } | ||
|
|
||
| // the tokenizer can (and should) be shared between constraints | ||
| LlgTokenizer *tokenizer = argc > 3 | ||
| ? create_hf_tokenizer(read_file(argv[3]), 2) | ||
| : create_byte_tokenizer(); | ||
|
|
||
| auto schema_json = read_file(argv[1]); | ||
| auto sample_json = read_file(argv[2]); | ||
|
|
||
| void run_constraint_test(LlgTokenizer *tokenizer, const std::string &schema_json, | ||
| const std::string &sample_json, const char *label) { | ||
| LlgConstraintInit init; | ||
| llg_constraint_init_set_defaults(&init, tokenizer); | ||
| init.log_stderr_level = 0; // default to 1 (warnings only) | ||
|
|
@@ -167,14 +217,6 @@ int main(int argc, const char *argv[]) { | |
| fail_constraint(c); | ||
| } | ||
|
|
||
| // for debugging the tokenizer: | ||
| // for (int i = 0; i < 320; ++i) { | ||
| // std::vector<uint32_t> tokens; | ||
| // tokens.push_back(i); | ||
| // std::string s = do_llg_stringify_tokens(tokenizer, tokens); | ||
| // printf("Token %d: %s\n", i, s.c_str()); | ||
| // } | ||
|
|
||
| // we assume our "LLM" will generate these tokens | ||
| auto tokens = do_llg_tokenize(tokenizer, sample_json); | ||
|
|
||
|
|
@@ -212,6 +254,35 @@ int main(int argc, const char *argv[]) { | |
| // we assume the constraint will force EOS at the end of the input | ||
| assert(mask_res.is_stop); | ||
|
|
||
| printf("OK!\n"); | ||
| llg_free_constraint(c); | ||
| printf("%s: OK!\n", label); | ||
| } | ||
|
|
||
| int main(int argc, const char *argv[]) { | ||
| if (argc < 3) { | ||
| printf("Usage: %s <schema.ll.json> <sample.json> [tokenizer.json]\n", | ||
| argv[0]); | ||
| return 1; | ||
| } | ||
|
|
||
| auto schema_json = read_file(argv[1]); | ||
| auto sample_json = read_file(argv[2]); | ||
|
|
||
| // Test with v1 API (LlgTokenizerInit + llg_new_tokenizer): uses the HF tokenizer loaded from argv[3] (with EOS token id 2) when a tokenizer.json is given, otherwise the byte tokenizer; the tokenizer is freed once the test has run. | ||
| { | ||
| LlgTokenizer *tokenizer = argc > 3 | ||
| ? create_hf_tokenizer(read_file(argv[3]), 2) | ||
| : create_byte_tokenizer(); | ||
| run_constraint_test(tokenizer, schema_json, sample_json, "v1"); | ||
| llg_free_tokenizer(tokenizer); | ||
| } | ||
|
|
||
| // Test with v2 API (LlgTokenizerInitV2 + llg_new_tokenizer_v2): always uses the byte tokenizer with a primary and an extra EOS token, even when a tokenizer.json was passed on the command line; the same schema and sample are reused. | ||
| { | ||
| LlgTokenizer *tokenizer = create_byte_tokenizer_v2(); | ||
| run_constraint_test(tokenizer, schema_json, sample_json, "v2"); | ||
| llg_free_tokenizer(tokenizer); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.