Skip to content

Commit

Permalink
[shortfin] Add C++ tokenizer wrapper library.
Browse files Browse the repository at this point in the history
* This is gated by SHORTFIN_ENABLE_TOKENIZERS (presently off).
* I'd like to either take over the wrapper or get mlc-ai/tokenizers-cpp#50 before putting much weight on this.
* There is no great C++ option for this component, so we go to the trouble of integrating a Rust component. We will need to do a bit of prep on our CI systems to enable this by default.
* Python API will be added in a subsequent commit. This should be more efficient than the tokenizers Python API since we will allow direct access to the tokens vs doing a lot of conversions.
* Obligatory language flame bait: Use Rust, they said. It's super efficient. Prior to this patch, libshortfin was 1.8MB, which gave us an entire GPU and CPU runtime stack. After this patch (stripped) it is 8.4MB. Given how important the use case is, I'm willing to tolerate this for the moment. It seems like there is room for something better here, which is why I did not expose the underlying vendor'd API directly.
  • Loading branch information
stellaraccident committed Nov 26, 2024
1 parent ddc3091 commit 2016aae
Show file tree
Hide file tree
Showing 9 changed files with 330 additions and 5 deletions.
71 changes: 67 additions & 4 deletions shortfin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ option(SHORTFIN_BUILD_TESTS "Builds C++ tests" ON)
option(SHORTFIN_BUNDLE_DEPS "Download dependencies instead of using system libraries" ON)
option(SHORTFIN_ENABLE_TRACING "Enable runtime tracing for iree and shortfin" OFF)
option(SHORTFIN_ENABLE_LTO "Enables LTO if supported" ON)
option(SHORTFIN_ENABLE_TOKENIZERS "Enables integration of native tokenizers library" OFF)

set(SHORTFIN_IREE_SOURCE_DIR "" CACHE FILEPATH "Path to IREE source")

Expand Down Expand Up @@ -80,6 +81,7 @@ list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_LIST_DIR}/build_tools/cmake/
)
include(shortfin_library)
include(shortfin_testing)
include(CheckCXXCompilerFlag)
include(FetchContent)

Expand All @@ -90,7 +92,9 @@ include(FetchContent)
if(SHORTFIN_ENABLE_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT SHORTFIN_LTO_SUPPORTED OUTPUT SHORTFIN_LTO_ERROR)
if(SHORTFIN_LTO_SUPPORTED)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Not enabling LTO for debug build")
elseif(SHORTFIN_LTO_SUPPORTED)
message(STATUS "Shortfin LTO Enabled")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
Expand Down Expand Up @@ -126,7 +130,9 @@ endif()
message(STATUS " - Host")

################################################################################
# Dependencies
# Bundled Dependencies
# These dependencies are either bundled or used via installed packages based
# on the SHORTFIN_BUNDLE_DEPS option.
################################################################################

if(SHORTFIN_BUNDLE_DEPS)
Expand Down Expand Up @@ -164,6 +170,7 @@ if(SHORTFIN_BUNDLE_DEPS)
shortfin_push_bundled_lib_options()
# Enable spdlog shared library options so we can export it.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPDLOG_SHARED_LIB -Dspdlog_EXPORTS")
message(STATUS "Fetching bundled projects")
FetchContent_MakeAvailable(fmt spdlog xtl xtensor)
shortfin_pop_bundled_lib_options()
else()
Expand All @@ -172,7 +179,8 @@ else()
endif()

################################################################################
# IREE
# IREE Dependency
# This is always a source dependency on the IREE runtime.
################################################################################

# Set IREE build flags.
Expand Down Expand Up @@ -237,6 +245,61 @@ else()
endif()
shortfin_pop_bundled_lib_options()

################################################################################
# Tokenizer Library
################################################################################

# Verifies that cargo (Rust's build tool) can be found and executed.
# Takes no arguments and sets nothing in the caller's scope. Problems are
# reported with message(SEND_ERROR), so the remainder of the configure step
# still runs but generation is blocked.
function(shortfin_check_tokenizers)
  # Make sure that rust/cargo is installed and usable.
  # NO_CACHE keeps the result local so a later configure re-runs the search.
  find_program(SHORTFIN_CARGO_PATH NAMES cargo NO_CACHE)
  if(NOT SHORTFIN_CARGO_PATH)
    message(SEND_ERROR
      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool). "
      "Please follow Rust documentation to install. On Ubuntu, this can typically be accomplished with:\n"
      " sudo apt install rustup && rustup default stable"
    )
    # There is nothing to execute. Returning here avoids running a
    # "SHORTFIN_CARGO_PATH-NOTFOUND" command and reporting a second,
    # misleading "found but returned an error" diagnostic.
    return()
  endif()

  # Make sure cargo is functional.
  execute_process(
    COMMAND "${SHORTFIN_CARGO_PATH}"
    RESULT_VARIABLE _CARGO_RESULT
    OUTPUT_VARIABLE _CARGO_OUT
    ERROR_VARIABLE _CARGO_ERR
  )
  if(NOT "${_CARGO_RESULT}" STREQUAL "0")
    message(SEND_ERROR
      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool) "
      "to be configured properly. It was found (${SHORTFIN_CARGO_PATH}) but returned an "
      "error. Output below:\n"
      "${_CARGO_OUT}\n"
      "${_CARGO_ERR}"
    )
  endif()
endfunction()

if(SHORTFIN_ENABLE_TOKENIZERS)
  # Validate the Rust toolchain up front.
  # TODO: submit a patch to tokenizers_cpp to allow explicit configuration of the
  # cargo location and pass that vs relying on environmental alignment.
  shortfin_check_tokenizers()

  # Everything between push/pop configures how the fetched project is built.
  shortfin_push_bundled_lib_options()

  # Build the vendored code with hidden symbol visibility.
  set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
  set(CMAKE_CXX_VISIBILITY_PRESET "hidden")
  set(CMAKE_C_VISIBILITY_PRESET "hidden")

  # Turn off the sentencepiece tokenizer in the fetched project.
  set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER OFF)

  # The content name matches the upstream CMake project() declaration.
  FetchContent_Declare(
    tokenizers_cpp
    GIT_REPOSITORY https://github.com/mlc-ai/tokenizers-cpp.git
    GIT_TAG 4bb753377680e249345b54c6b10e6d0674c8af03 # 2024 Nov 15
    EXCLUDE_FROM_ALL
  )
  message(STATUS "Fetching tokenizers_cpp")
  FetchContent_MakeAvailable(tokenizers_cpp)

  shortfin_pop_bundled_lib_options()
endif()

################################################################################
# Tests
################################################################################
Expand All @@ -256,7 +319,7 @@ if(SHORTFIN_BUILD_TESTS)
enable_testing()
endif()


add_custom_target(shortfin_testdata_deps)
add_subdirectory(src)

if(SHORTFIN_BUILD_PYTHON_BINDINGS)
Expand Down
5 changes: 4 additions & 1 deletion shortfin/build_tools/cmake/shortfin_library.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ function(shortfin_gtest_test)
GTest::gmock
GTest::gtest_main
)
gtest_discover_tests(${_RULE_NAME})
gtest_discover_tests(
${_RULE_NAME}
WORKING_DIRECTORY "${libshortfin_BINARY_DIR}"
)
endfunction()


Expand Down
47 changes: 47 additions & 0 deletions shortfin/build_tools/cmake/shortfin_testing.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Downloads a test data file as part of configure.
#
# Arguments:
#   URL: Location to download from.
#   OUTPUT_FILE: Path the downloaded file is written to.
#
# This does a download->rename in an attempt to be robust to partial downloads:
# the data is first written to "<OUTPUT_FILE>.stage" and only renamed into
# place when the download reports success. Nothing is downloaded if
# OUTPUT_FILE already exists.
# It should not be used to manage large test data files or anything sensitive
# enough to require a hash check.
# The output file is added as an additional clean file on the global
# shortfin_testdata_deps target, meaning that "ninja clean" will remove it.
# It is also added to the current directory's list of configure depends, which
# means that if ninja is run and it is not present, cmake will be re-invoked.
function(shortfin_download_test_data)
  cmake_parse_arguments(
    _RULE
    ""
    "URL;OUTPUT_FILE"
    ""
    ${ARGN}
  )
  if(NOT _RULE_URL OR NOT _RULE_OUTPUT_FILE)
    message(FATAL_ERROR
      "shortfin_download_test_data requires both URL and OUTPUT_FILE")
  endif()

  if(NOT EXISTS "${_RULE_OUTPUT_FILE}")
    set(_stage_file "${_RULE_OUTPUT_FILE}.stage")
    message(STATUS "Downloading test data ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}")
    file(DOWNLOAD "${_RULE_URL}" "${_stage_file}" STATUS _status)
    # STATUS is a list: the numeric code first, followed by a message.
    list(POP_FRONT _status _status_code)
    if(_status_code EQUAL "0")
      file(RENAME "${_stage_file}" "${_RULE_OUTPUT_FILE}")
    else()
      # Do not leave a partial download behind in the build tree.
      file(REMOVE "${_stage_file}")
      message(SEND_ERROR
        "Error downloading file ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}: ${_status}")
    endif()
  endif()

  # Make clean remove it. This must name the file that was actually requested
  # rather than a fixed file name.
  set_property(
    TARGET shortfin_testdata_deps
    APPEND PROPERTY ADDITIONAL_CLEAN_FILES
    "${_RULE_OUTPUT_FILE}"
  )

  # And make us reconfigure if it isn't there.
  set_property(
    DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    APPEND PROPERTY
    CMAKE_CONFIGURE_DEPENDS "${_RULE_OUTPUT_FILE}")
endfunction()
1 change: 1 addition & 0 deletions shortfin/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def build_cmake_configuration(CMAKE_BUILD_DIR: Path, extra_cmake_args=[]):
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_LTO", default_value="ON")
add_env_cmake_setting(cmake_args, "SHORTFIN_IREE_SOURCE_DIR")
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_ASAN")
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_TOKENIZERS", default_value="OFF")

# Only do a from-scratch configure if not already configured.
cmake_cache_file = os.path.join(CMAKE_BUILD_DIR, "CMakeCache.txt")
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Component subdirectories; each one is configured by its own CMakeLists.txt.
add_subdirectory(array)
# The tokenizers CMakeLists.txt wraps its contents in
# if(SHORTFIN_ENABLE_TOKENIZERS), so this is a no-op when the option is off.
add_subdirectory(components/tokenizers)
add_subdirectory(local)
add_subdirectory(support)
39 changes: 39 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Optional tokenizers component, built only when SHORTFIN_ENABLE_TOKENIZERS is
# set.
if(SHORTFIN_ENABLE_TOKENIZERS)
  shortfin_cc_component(
    NAME
      shortfin_tokenizers
    HDRS
      tokenizers.h
    SRCS
      tokenizers.cc
    DEFINES
      SHORTFIN_HAVE_TOKENIZERS
    COMPONENTS
      shortfin_support
    DEPS
      tokenizers_cpp
  )
  # Register the component in the global list of optional library components.
  set_property(GLOBAL APPEND
    PROPERTY SHORTFIN_LIB_OPTIONAL_COMPONENTS
    shortfin_tokenizers)
  # Advertise availability of the tokenizers API through the public defs target.
  target_compile_definitions(shortfin_public_defs INTERFACE SHORTFIN_HAVE_TOKENIZERS)

  # The test data is only consumed by the test below, so skip the
  # configure-time network download entirely when tests are not being built.
  if(SHORTFIN_BUILD_TESTS)
    # Download test data.
    shortfin_download_test_data(
      URL "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json"
      OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/tokenizer.json"
    )

    # Note that tests run from the binary dir of the project.
    shortfin_gtest_test(
      NAME shortfin_tokenizers_test
      SRCS
        tokenizers_test.cc
    )
  endif()
endif()
63 changes: 63 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/tokenizers.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "shortfin/components/tokenizers/tokenizers.h"

#include <exception>
#include <stdexcept>

#include "shortfin/support/logging.h"
#include "tokenizers_cpp.h"

namespace shortfin::tokenizers {

namespace {

// Access shim: re-declares the protected Tokenizer::vendor_tokenizer_ member
// as public so that helpers in this file can read it through a cast. It adds
// no data members or behavior of its own.
class AccessibleTokenizer : public Tokenizer {
public:
using Tokenizer::vendor_tokenizer_;
};

// Resolves the opaque pointer stored on `self` to the vendor tokenizer type.
// Throws std::logic_error if no vendor tokenizer is attached.
::tokenizers::Tokenizer *Get(Tokenizer *self) {
  auto *accessible = static_cast<AccessibleTokenizer *>(self);
  auto *vendor =
      static_cast<::tokenizers::Tokenizer *>(accessible->vendor_tokenizer_);
  if (vendor == nullptr) {
    throw std::logic_error("Tokenizer is null");
  }
  return vendor;
}

} // namespace

// Releases the owned vendor tokenizer, if any.
// This deliberately does not go through Get(): Get() throws when the pointer
// is null, and an exception escaping a destructor terminates the process.
// Deleting a null pointer is a no-op, so an instance that holds no vendor
// tokenizer is destroyed without error.
Tokenizer::~Tokenizer() {
  delete static_cast<::tokenizers::Tokenizer *>(vendor_tokenizer_);
}

// Creates a Tokenizer from an in-memory JSON blob by delegating to the vendor
// factory and taking ownership of the object it returns.
Tokenizer Tokenizer::FromBlobJSON(const std::string &json_blob) {
  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::FromBlobJSON");
  auto vendor = ::tokenizers::Tokenizer::FromBlobJSON(json_blob);
  return Tokenizer(vendor.release());
}

std::vector<int32_t> Tokenizer::Encode(const std::string &text) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Encode");
return Get(this)->Encode(text);
}

std::vector<std::vector<int32_t>> Tokenizer::EncodeBatch(
const std::vector<std::string> &texts) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::EncodeBatch");
return Get(this)->EncodeBatch(texts);
}

std::string Tokenizer::Decode(const std::vector<int32_t> &ids) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Decode");
return Get(this)->Decode(ids);
}
size_t Tokenizer::GetVocabSize() { return Get(this)->GetVocabSize(); }
std::string Tokenizer::IdToToken(int32_t token_id) {
return Get(this)->IdToToken(token_id);
}
int32_t Tokenizer::TokenToId(const std::string &token) {
return Get(this)->TokenToId(token);
}

} // namespace shortfin::tokenizers
52 changes: 52 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/tokenizers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
#define SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H

#include <string>
#include <vector>

#include "shortfin/support/api.h"

namespace shortfin::tokenizers {

// A vendored Tokenizer class that does not export the details of the backing
// implementation. While a little bit gross, this keeps us from needing to
// re-export a vendor'ed API as part of our public API.
// The current vendor tokenizer is based on mlc-ai/tokenizers-cpp. The API
// is fairly close to that implementation.
// See: https://github.com/mlc-ai/tokenizers-cpp
//
// Instances are move-only and are created through the factory functions. The
// backing implementation is held as an opaque pointer so that this header does
// not need to include any vendor headers.
class SHORTFIN_API Tokenizer {
 public:
  Tokenizer(const Tokenizer &) = delete;
  Tokenizer &operator=(const Tokenizer &) = delete;
  // Transfers the vendor tokenizer from `other` to the new instance. `other`
  // is left holding no vendor tokenizer, so exactly one object refers to the
  // underlying implementation after the move.
  Tokenizer(Tokenizer &&other) : vendor_tokenizer_(other.vendor_tokenizer_) {
    other.vendor_tokenizer_ = nullptr;
  }
  ~Tokenizer();

  // Factory functions.
  // Creates a tokenizer from the contents of a tokenizer JSON document.
  static Tokenizer FromBlobJSON(const std::string &json_blob);

  // Encodes a single string into token ids.
  std::vector<int32_t> Encode(const std::string &text);
  // Encodes a batch of strings, producing one vector of token ids per input.
  std::vector<std::vector<int32_t>> EncodeBatch(
      const std::vector<std::string> &texts);
  // Decodes a sequence of token ids back into a string.
  std::string Decode(const std::vector<int32_t> &ids);
  // Returns the size of the vocabulary.
  size_t GetVocabSize();
  // Converts between a token id and its string form.
  std::string IdToToken(int32_t token_id);
  int32_t TokenToId(const std::string &token);

 private:
  // Wraps an already constructed vendor tokenizer (type-erased to void*).
  Tokenizer(void *vendor_tokenizer) : vendor_tokenizer_(vendor_tokenizer) {}

 protected:
  // Opaque pointer to the vendor tokenizer, or nullptr when none is attached.
  void *vendor_tokenizer_;
};

} // namespace shortfin::tokenizers

#endif // SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
Loading

0 comments on commit 2016aae

Please sign in to comment.