1 change: 0 additions & 1 deletion example/CMakeLists.txt
@@ -25,4 +25,3 @@ target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)
# You can link tokenizers_cpp, it will automatically link tokenizers_c
# and sentencepiece library
target_link_libraries(example PRIVATE tokenizers_cpp)

2 changes: 1 addition & 1 deletion example/build_and_run.sh
@@ -11,7 +11,7 @@ cd ..
mkdir -p dist
cd dist
if [ ! -f "tokenizer.model" ]; then
wget https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model
wget https://huggingface.co/lmsys/vicuna-7b-v1.5/resolve/main/tokenizer.model
Review comment (Contributor Author): wget cannot download decapoda-research/llama-7b-hf without logging in, but vicuna-7b-v1.5 does not have this restriction.

fi
if [ ! -f "tokenizer.json" ]; then
wget https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json
88 changes: 61 additions & 27 deletions example/example.cc
@@ -1,5 +1,7 @@
#include <tokenizers_cpp.h>

#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
@@ -30,60 +32,92 @@ void PrintEncodeResult(const std::vector<int>& ids) {
std::cout << "]" << std::endl;
}

void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
bool check_id_back = true) {
// Check #1. Encode and Decode
std::string prompt = "What is the capital of Canada?";
std::vector<int> ids = tok->Encode(prompt);
std::string decoded_prompt = tok->Decode(ids);
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
assert(decoded_prompt == prompt);

// Check #2. IdToToken and TokenToId
std::vector<int32_t> ids_to_test = {0, 1, 2, 3, 32, 33, 34, 130, 131, 1000};
for (auto id : ids_to_test) {
auto token = tok->IdToToken(id);
auto id_new = tok->TokenToId(token);
std::cout << "id=" << id << ", token=\"" << token << "\", id_new=" << id_new << std::endl;
if (check_id_back) {
assert(id == id_new);
}
}

// Check #3. GetVocabSize
auto vocab_size = tok->GetVocabSize();
std::cout << "vocab_size=" << vocab_size << std::endl;

std::cout << std::endl;
}

// Sentencepiece tokenizer
// - dist/tokenizer.model
void SentencePieceTokenizerExample() {
std::cout << "Tokenizer: SentencePiece" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

// Read blob from file.
auto blob = LoadBytesFromFile("dist/tokenizer.model");
// Note: all the current factory APIs take an in-memory blob as input.
// This gives some flexibility on how these blobs can be read.
auto tok = Tokenizer::FromBlobSentencePiece(blob);
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "SetencePiece tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

TestTokenizer(std::move(tok), false, true);
}

// HF tokenizer
// - dist/tokenizer.json
void HuggingFaceTokenizerExample() {
std::cout << "Tokenizer: Huggingface" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

// Read blob from file.
auto blob = LoadBytesFromFile("dist/tokenizer.json");
// Note: all the current factory APIs take an in-memory blob as input.
// This gives some flexibility on how these blobs can be read.
auto tok = Tokenizer::FromBlobJSON(blob);
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "HF tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

TestTokenizer(std::move(tok), false, true);
}

// RWKV world tokenizer
// - dist/tokenizer_model
void RWKVWorldTokenizerExample() {
std::cout << "Tokenizer: RWKVWorld" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

auto tok = Tokenizer::FromBlobRWKVWorld("dist/tokenizer_model");
std::string prompt = "What is the capital of Canada?";
// call Encode to turn prompt into token ids
std::vector<int> ids = tok->Encode(prompt);
// call Decode to turn ids into string
std::string decoded_prompt = tok->Decode(ids);

// print encoded result
std::cout << "RWKV World tokenizer: " << std::endl;
PrintEncodeResult(ids);
std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

std::cout << "Load time: " << duration << " ms" << std::endl;

// We cannot check id back for RWKVWorldTokenizer yet.
TestTokenizer(std::move(tok), false, false);
}

int main(int argc, char* argv[]) {
Expand Down
50 changes: 0 additions & 50 deletions include/rwkv_world_tokenizer.h

This file was deleted.

7 changes: 7 additions & 0 deletions include/tokenizers_c.h
@@ -32,6 +32,13 @@ void tokenizers_get_decode_str(TokenizerHandle handle, const char** data, size_t

void tokenizers_get_encode_ids(TokenizerHandle handle, const uint32_t** id_data, size_t* len);

void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size);

void tokenizers_id_to_token(TokenizerHandle handle, uint32_t id, const char** data, size_t* len);

// tokenizers_token_to_id writes -1 to *id if the token is not in the vocab
void tokenizers_token_to_id(TokenizerHandle handle, const char* token, size_t len, int32_t* id);

void tokenizers_free(TokenizerHandle handle);

#ifdef __cplusplus
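As a usage sketch (not part of this PR), a C++ caller might exercise the three new entry points like this; the factory call that produces `handle` and all error handling are omitted:

#include <tokenizers_c.h>

#include <cstdint>
#include <iostream>
#include <string>

// Illustrative caller for the new entries. `handle` is assumed to be a valid
// TokenizerHandle obtained from one of the factory functions in this header.
void InspectVocab(TokenizerHandle handle, uint32_t id) {
  size_t vocab_size = 0;
  tokenizers_get_vocab_size(handle, &vocab_size);
  std::cout << "vocab_size=" << vocab_size << std::endl;

  const char* data = nullptr;
  size_t len = 0;
  tokenizers_id_to_token(handle, id, &data, &len);
  // Copy the bytes out: the buffer is owned by the tokenizer and is only
  // valid until the next tokenizers_id_to_token call.
  std::string token(data, len);

  int32_t round_trip = 0;
  tokenizers_token_to_id(handle, token.data(), token.size(), &round_trip);
  std::cout << "token=\"" << token << "\", id=" << round_trip
            << (round_trip == -1 ? " (not in vocab)" : "") << std::endl;
}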
16 changes: 16 additions & 0 deletions include/tokenizers_cpp.h
@@ -36,6 +36,22 @@ class Tokenizer {
*/
virtual std::string Decode(const std::vector<int32_t>& ids) = 0;

/*!
* \brief Returns the vocabulary size, including special tokens.
*/
virtual size_t GetVocabSize() = 0;

/*!
* \brief Convert the given id to its corresponding token if it exists. If not, return an
* empty string.
*/
virtual std::string IdToToken(int32_t token_id) = 0;

/*!
* \brief Convert the given token to its corresponding id if it exists. If not, return -1.
*/
virtual int32_t TokenToId(const std::string& token) = 0;

//---------------------------------------------------
// Factory functions from byte-blobs
// These factory functions take in-memory blobs
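Through the public C++ API, a short usage sketch of the new methods (again illustrative; LoadBytesFromFile is the file-reading helper from example/example.cc):

#include <tokenizers_cpp.h>

#include <iostream>
#include <string>

void VocabIntrospection() {
  // Load dist/tokenizer.json the same way example/example.cc does.
  auto blob = LoadBytesFromFile("dist/tokenizer.json");
  auto tok = tokenizers::Tokenizer::FromBlobJSON(blob);

  // Vocabulary size, with special tokens included.
  std::cout << "vocab_size=" << tok->GetVocabSize() << std::endl;

  // Id -> token -> id round trip: IdToToken returns "" for unknown ids,
  // TokenToId returns -1 for unknown tokens.
  std::string token = tok->IdToToken(0);
  int32_t id = tok->TokenToId(token);
  std::cout << "token=\"" << token << "\", id=" << id << std::endl;
}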
46 changes: 46 additions & 0 deletions rust/src/lib.rs
@@ -9,6 +9,7 @@ pub struct TokenizerWrapper {
tokenizer: Tokenizer,
encode_ids: Vec<u32>,
decode_str: String,
id_to_token_result: String,
}

pub type Vocab = HashMap<String, u32>;
@@ -20,6 +21,7 @@ impl TokenizerWrapper {
tokenizer: Tokenizer::from_str(json).unwrap().into(),
encode_ids: Vec::new(),
decode_str: String::new(),
id_to_token_result: String::new(),
}
}

@@ -77,6 +79,7 @@ impl TokenizerWrapper {
tokenizer: tokenizer,
encode_ids: Vec::new(),
decode_str: String::new(),
id_to_token_result: String::new(),
}
}

@@ -182,3 +185,46 @@ extern "C" fn tokenizers_free(wrapper: *mut TokenizerWrapper) {
drop(Box::from_raw(wrapper));
}
}

#[no_mangle]
extern "C" fn tokenizers_get_vocab_size(handle: *mut TokenizerWrapper, size: *mut usize) {
unsafe {
*size = (*handle).tokenizer.get_vocab_size(true);
}
}

#[no_mangle]
extern "C" fn tokenizers_id_to_token(
handle: *mut TokenizerWrapper,
id: u32,
out_cstr: *mut *mut u8,
out_len: *mut usize,
) {
unsafe {
let str = (*handle).tokenizer.id_to_token(id);
(*handle).id_to_token_result = match str {
Some(s) => s,
None => String::from(""),
};

*out_cstr = (*handle).id_to_token_result.as_mut_ptr();
*out_len = (*handle).id_to_token_result.len();
}
}

#[no_mangle]
extern "C" fn tokenizers_token_to_id(
handle: *mut TokenizerWrapper,
token: *const u8,
len: usize,
out_id: *mut i32,
) {
unsafe {
let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap();
let id = (*handle).tokenizer.token_to_id(token);
*out_id = match id {
Some(id) => id as i32,
None => -1,
};
}
}
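A note on the lifetime design above: tokenizers_id_to_token hands back a pointer into id_to_token_result, which lives on the TokenizerWrapper itself, so the bytes remain valid after the call returns. Each call overwrites the previous result, so a caller should copy the string out before the next lookup; the C++ HFTokenizer below does exactly that by constructing a std::string from the returned pointer and length.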
26 changes: 25 additions & 1 deletion src/huggingface_tokenizer.cc
@@ -7,6 +7,8 @@
#include <tokenizers_c.h>
#include <tokenizers_cpp.h>

#include <cassert>

namespace tokenizers {
/*!
* \brief A simple c++ header of tokenizer via C API.
@@ -31,7 +33,9 @@ class HFTokenizer : public Tokenizer {
const uint32_t* data;
size_t len;
tokenizers_get_encode_ids(handle_, &data, &len);
return std::vector<int32_t>(data, data + len);
// Reinterpret the Rust-side uint32_t ids as int32_t to stay consistent
// with sentencepiece, which uses i32 ids.
const int32_t* data_i32 = reinterpret_cast<const int32_t*>(data);
auto res = std::vector<int32_t>(data_i32, data_i32 + len);
return res;
}

// use i32 to be consistent with sentencepiece
@@ -45,6 +49,26 @@
return std::string(data, len);
}

size_t GetVocabSize() final {
size_t size;
tokenizers_get_vocab_size(handle_, &size);
assert(size > 0);
return size;
}

std::string IdToToken(int32_t id) final {
const char* data;
size_t len;
tokenizers_id_to_token(handle_, static_cast<uint32_t>(id), &data, &len);
return std::string(data, len);
}

int32_t TokenToId(const std::string& token) final {
int32_t id;
tokenizers_token_to_id(handle_, token.data(), token.length(), &id);
return id;
}

private:
// internal handle
TokenizerHandle handle_{nullptr};