1 change: 0 additions & 1 deletion example/CMakeLists.txt
@@ -25,4 +25,3 @@ target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)
# You can link tokenizers_cpp, it will automatically link tokenizers_c
# and sentencepiece library
target_link_libraries(example PRIVATE tokenizers_cpp)

2 changes: 1 addition & 1 deletion example/build_and_run.sh
@@ -11,7 +11,7 @@ cd ..
mkdir -p dist
cd dist
if [ ! -f "tokenizer.model" ]; then
wget https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model
wget https://huggingface.co/lmsys/vicuna-7b-v1.5/resolve/main/tokenizer.model
Review comment (Contributor Author): wget cannot download decapoda-research/llama-7b-hf without logging in, but vicuna-7b-v1.5 does not have this restriction.

fi
if [ ! -f "tokenizer.json" ]; then
wget https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json
88 changes: 61 additions & 27 deletions example/example.cc
@@ -1,5 +1,7 @@
#include <tokenizers_cpp.h>

#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
@@ -30,60 +32,92 @@ void PrintEncodeResult(const std::vector<int>& ids) {
std::cout << "]" << std::endl;
}

void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
bool check_id_back = true) {
// Check #1. Encode and Decode
std::string prompt = "What is the capital of Canada?";
std::vector<int> ids = tok->Encode(prompt);
std::string decoded_prompt = tok->Decode(ids);
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
assert(decoded_prompt == prompt);

// Check #2. IdToToken and TokenToId
std::vector<int32_t> ids_to_test = {0, 1, 2, 3, 32, 33, 34, 130, 131, 1000};
for (auto id : ids_to_test) {
auto token = tok->IdToToken(id);
auto id_new = tok->TokenToId(token);
std::cout << "id=" << id << ", token=\"" << token << "\", id_new=" << id_new << std::endl;
if (check_id_back) {
assert(id == id_new);
}
}

// Check #3. GetVocabSize
auto vocab_size = tok->GetVocabSize();
std::cout << "vocab_size=" << vocab_size << std::endl;

std::cout << std::endl;
}

// Sentencepiece tokenizer
// - dist/tokenizer.model
void SentencePieceTokenizerExample() {
std::cout << "Tokenizer: SentencePiece" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

// Read blob from file.
auto blob = LoadBytesFromFile("dist/tokenizer.model");
// Note: all the current factory APIs take an in-memory blob as input.
// This gives some flexibility on how these blobs can be read.
auto tok = Tokenizer::FromBlobSentencePiece(blob);
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "SetencePiece tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

TestTokenizer(std::move(tok), false, true);
}

// HF tokenizer
// - dist/tokenizer.json
void HuggingFaceTokenizerExample() {
std::cout << "Tokenizer: Huggingface" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

// Read blob from file.
auto blob = LoadBytesFromFile("dist/tokenizer.json");
// Note: all the current factory APIs take an in-memory blob as input.
// This gives some flexibility on how these blobs can be read.
auto tok = Tokenizer::FromBlobJSON(blob);
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "HF tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

TestTokenizer(std::move(tok), false, true);
}

// RWKV world tokenizer
// - dist/tokenizer_model
void RWKVWorldTokenizerExample() {
std::cout << "Tokenizer: RWKVWorld" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

auto tok = Tokenizer::FromBlobRWKVWorld("dist/tokenizer_model");
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "RWKV World tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

// We cannot check id back for RWKVWorldTokenizer yet.
TestTokenizer(std::move(tok), false, false);
}

int main(int argc, char* argv[]) {
Expand Down
50 changes: 0 additions & 50 deletions include/rwkv_world_tokenizer.h

This file was deleted.

7 changes: 7 additions & 0 deletions include/tokenizers_c.h
@@ -32,6 +32,13 @@ void tokenizers_get_decode_str(TokenizerHandle handle, const char** data, size_t

void tokenizers_get_encode_ids(TokenizerHandle handle, const uint32_t** id_data, size_t* len);

void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size);

void tokenizers_id_to_token(TokenizerHandle handle, uint32_t id, const char** data, size_t* len);

// tokenizers_token_to_id writes -1 to *id if the token is not in the vocab
void tokenizers_token_to_id(TokenizerHandle handle, const char* token, size_t len, int32_t* id);

void tokenizers_free(TokenizerHandle handle);

#ifdef __cplusplus
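As a usage sketch (not part of this PR), a C++ caller might exercise the three new entry points like this; the factory call that produces `handle` and all error handling are omitted:

#include <tokenizers_c.h>

#include <cstdint>
#include <iostream>
#include <string>

// Illustrative caller for the new entries. `handle` is assumed to be a valid
// TokenizerHandle obtained from one of the factory functions in this header.
void InspectVocab(TokenizerHandle handle, uint32_t id) {
  size_t vocab_size = 0;
  tokenizers_get_vocab_size(handle, &vocab_size);
  std::cout << "vocab_size=" << vocab_size << std::endl;

  const char* data = nullptr;
  size_t len = 0;
  tokenizers_id_to_token(handle, id, &data, &len);
  // Copy the bytes out: the buffer is owned by the tokenizer and is only
  // valid until the next tokenizers_id_to_token call.
  std::string token(data, len);

  int32_t round_trip = 0;
  tokenizers_token_to_id(handle, token.data(), token.size(), &round_trip);
  std::cout << "token=\"" << token << "\", id=" << round_trip
            << (round_trip == -1 ? " (not in vocab)" : "") << std::endl;
}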
16 changes: 16 additions & 0 deletions include/tokenizers_cpp.h
@@ -36,6 +36,22 @@ class Tokenizer {
*/
virtual std::string Decode(const std::vector<int32_t>& ids) = 0;

/*!
* \brief Returns the vocabulary size, including special tokens.
*/
virtual size_t GetVocabSize() = 0;

/*!
* \brief Convert the given id to its corresponding token if it exists. If not, return an
* empty string.
*/
virtual std::string IdToToken(int32_t token_id) = 0;

/*!
* \brief Convert the given token to its corresponding id if it exists. If not, return -1.
*/
virtual int32_t TokenToId(const std::string& token) = 0;

//---------------------------------------------------
// Factory functions from byte-blobs
// These factory functions take in-memory blobs
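Through the public C++ API, a short usage sketch of the new methods (again illustrative; LoadBytesFromFile is the file-reading helper from example/example.cc):

#include <tokenizers_cpp.h>

#include <iostream>
#include <string>

void VocabIntrospection() {
  // Load dist/tokenizer.json the same way example/example.cc does.
  auto blob = LoadBytesFromFile("dist/tokenizer.json");
  auto tok = tokenizers::Tokenizer::FromBlobJSON(blob);

  // Vocabulary size, with special tokens included.
  std::cout << "vocab_size=" << tok->GetVocabSize() << std::endl;

  // Id -> token -> id round trip: IdToToken returns "" for unknown ids,
  // TokenToId returns -1 for unknown tokens.
  std::string token = tok->IdToToken(0);
  int32_t id = tok->TokenToId(token);
  std::cout << "token=\"" << token << "\", id=" << id << std::endl;
}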
46 changes: 46 additions & 0 deletions rust/src/lib.rs
@@ -9,6 +9,7 @@ pub struct TokenizerWrapper {
tokenizer: Tokenizer,
encode_ids: Vec<u32>,
decode_str: String,
id_to_token_result: String,
}

pub type Vocab = HashMap<String, u32>;
@@ -20,6 +21,7 @@ impl TokenizerWrapper {
tokenizer: Tokenizer::from_str(json).unwrap().into(),
encode_ids: Vec::new(),
decode_str: String::new(),
id_to_token_result: String::new(),
}
}

@@ -77,6 +79,7 @@ impl TokenizerWrapper {
tokenizer: tokenizer,
encode_ids: Vec::new(),
decode_str: String::new(),
id_to_token_result: String::new(),
}
}

@@ -182,3 +185,46 @@ extern "C" fn tokenizers_free(wrapper: *mut TokenizerWrapper) {
drop(Box::from_raw(wrapper));
}
}

#[no_mangle]
extern "C" fn tokenizers_get_vocab_size(handle: *mut TokenizerWrapper, size: *mut usize) {
unsafe {
*size = (*handle).tokenizer.get_vocab_size(true);
}
}

#[no_mangle]
extern "C" fn tokenizers_id_to_token(
handle: *mut TokenizerWrapper,
id: u32,
out_cstr: *mut *mut u8,
out_len: *mut usize,
) {
unsafe {
let str = (*handle).tokenizer.id_to_token(id);
(*handle).id_to_token_result = match str {
Some(s) => s,
None => String::from(""),
};

*out_cstr = (*handle).id_to_token_result.as_mut_ptr();
*out_len = (*handle).id_to_token_result.len();
}
}

#[no_mangle]
extern "C" fn tokenizers_token_to_id(
handle: *mut TokenizerWrapper,
token: *const u8,
len: usize,
out_id: *mut i32,
) {
unsafe {
let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap();
let id = (*handle).tokenizer.token_to_id(token);
*out_id = match id {
Some(id) => id as i32,
None => -1,
};
}
}
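A note on the lifetime design above: tokenizers_id_to_token hands back a pointer into id_to_token_result, which lives on the TokenizerWrapper itself, so the bytes remain valid after the call returns. Each call overwrites the previous result, so a caller should copy the string out before the next lookup; the C++ HFTokenizer below does exactly that by constructing a std::string from the returned pointer and length.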
26 changes: 25 additions & 1 deletion src/huggingface_tokenizer.cc
@@ -7,6 +7,8 @@
#include <tokenizers_c.h>
#include <tokenizers_cpp.h>

#include <cassert>

namespace tokenizers {
/*!
* \brief A simple c++ header of tokenizer via C API.
@@ -31,7 +33,9 @@ class HFTokenizer : public Tokenizer {
const uint32_t* data;
size_t len;
tokenizers_get_encode_ids(handle_, &data, &len);
return std::vector<int32_t>(data, data + len);
// Reinterpret the Rust-side uint32_t ids as int32_t to stay consistent
// with sentencepiece, which uses i32 ids.
const int32_t* data_i32 = reinterpret_cast<const int32_t*>(data);
auto res = std::vector<int32_t>(data_i32, data_i32 + len);
return res;
}

// use i32 to be consistent with sentencepiece
@@ -45,6 +49,26 @@
return std::string(data, len);
}

size_t GetVocabSize() final {
size_t size;
tokenizers_get_vocab_size(handle_, &size);
assert(size > 0);
return size;
}

std::string IdToToken(int32_t id) final {
const char* data;
size_t len;
tokenizers_id_to_token(handle_, static_cast<uint32_t>(id), &data, &len);
return std::string(data, len);
}

int32_t TokenToId(const std::string& token) final {
int32_t id;
tokenizers_token_to_id(handle_, token.data(), token.length(), &id);
return id;
}

private:
// internal handle
TokenizerHandle handle_{nullptr};