diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index cf8bc96bf23..0735b5331e8 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -21,7 +21,7 @@ project(llama_runner) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF) -option(EXECUTORCH_BUILD_RE2 "Build RE2" OFF) +option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF) include(CMakeDependentOption) # @@ -88,7 +88,7 @@ endif() # llama_runner library add_subdirectory(runner) -if(EXECUTORCH_BUILD_RE2) +if(EXECUTORCH_USE_TIKTOKEN) # find RE2 for tokenizer set(ABSL_ENABLE_INSTALL ON) set(_pic_flag diff --git a/examples/models/llama2/main.cpp b/examples/models/llama2/main.cpp index a8cb048f88b..10a355a6037 100644 --- a/examples/models/llama2/main.cpp +++ b/examples/models/llama2/main.cpp @@ -39,11 +39,6 @@ DEFINE_int32( -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); -DEFINE_bool( - use_tiktoken, - false, - "Use Tiktoken tokenizer instead of the default BPE tokenizer."); - int32_t main(int32_t argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -62,8 +57,6 @@ int32_t main(int32_t argc, char** argv) { int32_t cpu_threads = FLAGS_cpu_threads; - bool use_tiktoken = FLAGS_use_tiktoken; - #if defined(ET_USE_THREADPOOL) uint32_t num_performant_cores = cpu_threads == -1 ? torch::executorch::cpuinfo::get_num_performant_cores() @@ -76,8 +69,7 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - ::torch::executor::Runner runner( - model_path, tokenizer_path, temperature, use_tiktoken); + ::torch::executor::Runner runner(model_path, tokenizer_path, temperature); // generate runner.generate(prompt, seq_len); diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index 81a80dab9c5..9e9ad0d4879 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -39,9 +39,17 @@ list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") target_include_directories(extension_module INTERFACE ${_common_include_directories}) -if(CMAKE_TOOLCHAIN_IOS OR ANDROID OR APPLE) - # Building a share library on iOS requires code signing - # On Android we see duplicated registration when using shared lib +if(EXECUTORCH_USE_TIKTOKEN) + list(APPEND _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp) + set(_preprocessor_flag -DET_USE_TIKTOKEN) +endif() + +if(CMAKE_TOOLCHAIN_IOS + OR ANDROID + OR APPLE) + # Building a share library on iOS requires code signing On Android we see + # duplicated registration when using shared lib add_library(llama_runner STATIC ${_llama_runner__srcs}) else() add_library(llama_runner SHARED ${_llama_runner__srcs}) @@ -49,9 +57,8 @@ endif() set(llama_runner_deps executorch extension_module extension_data_loader) -target_link_libraries( - llama_runner PUBLIC ${llama_runner_deps}) +target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) -target_include_directories(llama_runner - INTERFACE ${_common_include_directories} - ${EXECUTORCH_ROOT}) +target_include_directories(llama_runner INTERFACE ${_common_include_directories} + ${EXECUTORCH_ROOT}) +target_compile_options(llama_runner PUBLIC ${_preprocessor_flag}) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index c6889a150dd..222a9185893 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -11,7 +11,9 @@ #include #include +#if defined(ET_USE_TIKTOKEN) #include +#endif #include #include @@ -38,10 +40,8 @@ std::string statsToJsonString(const Runner::Stats& stats); Runner::Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature, - bool use_tiktoken) - : use_tiktoken_(use_tiktoken), - module_(std::make_unique( + const float temperature) + : module_(std::make_unique( model_path, Module::MlockConfig::UseMlockIgnoreErrors)), tokenizer_path_(tokenizer_path), @@ -80,11 +80,11 @@ Error Runner::load() { append_eos_ = getMetadataHelper("append_eos_to_prompt", false); // Load tokenizer - if (use_tiktoken_) { - tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); - } else { - tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); - } +#if defined(ET_USE_TIKTOKEN) + tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); +#else + tokenizer_ = std::make_unique(vocab_size_, bos_id_, eos_id_); +#endif tokenizer_->load(tokenizer_path_); if (tokenizer_->bos_tok() != bos_id_) { ET_LOG( diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index f15cdb636d0..4e200d5e6ca 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -29,8 +29,7 @@ class Runner { explicit Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature = 0.8f, - bool use_tiktoken = false); + const float temperature = 0.8f); struct Stats { // Scaling factor for timestamps - in this case, we use ms. @@ -86,7 +85,6 @@ class Runner { int32_t n_bos_; int32_t n_eos_; int32_t max_seq_len_; - bool use_tiktoken_; bool use_kv_cache_; bool use_sdpa_with_kv_cache_; bool append_eos_; diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index f3b8e1ed874..5e1a324ce54 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -30,14 +30,17 @@ def define_common_targets(): exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix, - "//executorch/examples/models/llama2/tokenizer:tokenizer", "//executorch/extension/evalue_util:print_evalue" + aten_suffix, "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ] + (_get_operator_lib(aten)) + ([ + ] + ([ + "//executorch/examples/models/llama2/tokenizer:tiktoken", + ] if native.read_config("llama", "use_tiktoken", "0") == "1" else [ + "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer", + ]) + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE) # Therefore enable it explicitly for now to avoid failing tests "//executorch/backends/vulkan:vulkan_backend_lib", diff --git a/examples/models/llama2/tokenizer/targets.bzl b/examples/models/llama2/tokenizer/targets.bzl index 2ac2b483991..9c3d87a7728 100644 --- a/examples/models/llama2/tokenizer/targets.bzl +++ b/examples/models/llama2/tokenizer/targets.bzl @@ -2,14 +2,30 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): runtime.cxx_library( - name = "tokenizer", + name = "bpe_tokenizer", srcs = [ "bpe_tokenizer.cpp", - "tiktoken.cpp", ], exported_headers = [ "tokenizer.h", "bpe_tokenizer.h", + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "tiktoken", + srcs = [ + "tiktoken.cpp", + ], + exported_headers = [ + "tokenizer.h", "tiktoken.h", "base64.h", ], diff --git a/examples/models/llama2/tokenizer/test/targets.bzl b/examples/models/llama2/tokenizer/test/targets.bzl index 3642ceca66f..b8225cd06df 100644 --- a/examples/models/llama2/tokenizer/test/targets.bzl +++ b/examples/models/llama2/tokenizer/test/targets.bzl @@ -8,12 +8,12 @@ def define_common_targets(): """ runtime.cxx_test( - name = "test", + name = "test_bpe_tokenizer", srcs = [ - "test_tokenizer.cpp", + "test_bpe_tokenizer.cpp", ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tokenizer", + "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer", ], env = { "RESOURCES_PATH": "$(location :resources)/resources", @@ -26,7 +26,7 @@ def define_common_targets(): "test_tiktoken.cpp", ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tokenizer", + "//executorch/examples/models/llama2/tokenizer:tiktoken", ], env = { "RESOURCES_PATH": "$(location :resources_fb_only)/resources", diff --git a/examples/models/llama2/tokenizer/test/test_tokenizer.cpp b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp similarity index 100% rename from examples/models/llama2/tokenizer/test/test_tokenizer.cpp rename to examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp