 
 // Local
 #include <pytorch/tokenizers/pre_tokenizer.h>
-#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
+#include <unicode.h>
 
 // Standard
 #include <algorithm>
@@ -63,37 +63,35 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
6363 " Missing pretokenizers for PreTokenizer of type Sequence" );
6464 }
6565 std::vector<PreTokenizer::Ptr> pretoks;
66- std::transform (
67- pretokenizers->begin (),
68- pretokenizers->end (),
69- std::back_inserter (pretoks),
70- [](const PreTokenizerConfig& cfg) { return cfg.create (); });
66+ std::transform (pretokenizers->begin (), pretokenizers->end (),
67+ std::back_inserter (pretoks),
68+ [](const PreTokenizerConfig &cfg) { return cfg.create (); });
7169 return PreTokenizer::Ptr (new SequencePreTokenizer (pretoks));
7270 }
7371 throw std::runtime_error (" Unsupported PreTokenizer type: " + type);
7472}
7573
76- PreTokenizerConfig& PreTokenizerConfig::parse_json (const json& json_config) {
74+ PreTokenizerConfig & PreTokenizerConfig::parse_json (const json & json_config) {
7775 type = json_config.at (" type" );
7876 if (type == " Split" ) {
7977 try {
8078 pattern = json_config.at (" pattern" );
81- } catch (json::out_of_range&) {
79+ } catch (json::out_of_range &) {
8280 }
8381 } else if (type == " Digits" ) {
8482 try {
8583 individual_digits = json_config.at (" individual_digits" );
86- } catch (json::out_of_range&) {
84+ } catch (json::out_of_range &) {
8785 }
8886 } else if (type == " ByteLevel" ) {
8987 try {
9088 add_prefix_space = json_config.at (" add_prefix_space" );
91- } catch (json::out_of_range&) {
89+ } catch (json::out_of_range &) {
9290 }
9391 // TODO: trim_offsets, use_regex
9492 } else if (type == " Sequence" ) {
9593 pretokenizers = std::vector<PreTokenizerConfig>();
96- for (const auto & entry : json_config.at (" pretokenizers" )) {
94+ for (const auto & entry : json_config.at (" pretokenizers" )) {
9795 pretokenizers->push_back (PreTokenizerConfig ().parse_json (entry));
9896 }
9997 } else {
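
For reference, a minimal usage sketch of the config path in the hunk above (hypothetical, not part of this commit). It assumes nlohmann::json, which the json::out_of_range catches suggest, and places the types at global scope; the real header may wrap them in a namespace.

#include <pytorch/tokenizers/pre_tokenizer.h>

#include <nlohmann/json.hpp>

int main() {
  // A Sequence config shaped like a HuggingFace tokenizer.json entry.
  nlohmann::json cfg = nlohmann::json::parse(R"({
    "type": "Sequence",
    "pretokenizers": [
      {"type": "Digits", "individual_digits": true},
      {"type": "ByteLevel", "add_prefix_space": false}
    ]
  })");
  // parse_json() fills the config; create() builds the pre-tokenizer chain.
  PreTokenizer::Ptr pt = PreTokenizerConfig().parse_json(cfg).create();
}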
@@ -104,14 +102,14 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
 
 // RegexPreTokenizer ///////////////////////////////////////////////////////////
 
-RegexPreTokenizer::Re2UPtr RegexPreTokenizer::create_regex_(
-    const std::string& pattern) {
+RegexPreTokenizer::Re2UPtr
+RegexPreTokenizer::create_regex_(const std::string &pattern) {
   assert(!pattern.empty());
   return std::make_unique<re2::RE2>("(" + pattern + ")");
 }
 
-std::vector<std::string> RegexPreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+RegexPreTokenizer::pre_tokenize(re2::StringPiece input) const {
   std::vector<std::string> result;
   std::string piece;
   while (RE2::FindAndConsume(&input, *regex_, &piece)) {
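
The "(" + pattern + ")" wrapping in create_regex_ matters: RE2::FindAndConsume binds its output arguments from capturing groups, so the whole pattern is made group 1. A standalone sketch of the same splitting loop, using only re2 and a toy pattern:

#include <re2/re2.h>

#include <string>
#include <vector>

int main() {
  re2::RE2 regex("(\\S+)");  // capture group 1 feeds &piece below
  re2::StringPiece input("a bb ccc");
  std::string piece;
  std::vector<std::string> result;
  // Each match is appended to result and consumed from the front of `input`.
  while (re2::RE2::FindAndConsume(&input, regex, &piece)) {
    result.push_back(piece);  // "a", "bb", "ccc"
  }
}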
@@ -138,14 +136,13 @@ constexpr char GPT2_EXPR[] =
 // Construction //
 //////////////////
 
-ByteLevelPreTokenizer::ByteLevelPreTokenizer(
-    bool add_prefix_space,
-    const std::string& pattern)
+ByteLevelPreTokenizer::ByteLevelPreTokenizer(bool add_prefix_space,
+                                             const std::string &pattern)
     : pattern_(pattern.empty() ? GPT2_EXPR : pattern),
       add_prefix_space_(add_prefix_space) {}
 
-std::vector<std::string> ByteLevelPreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece input) const {
   // Add the prefix space if configured to do so
   std::string input_str(input);
   if (add_prefix_space_ && !input_str.empty() && input_str[0] != ' ') {
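
A behavioral sketch of the prefix-space rule in the hunk above (hypothetical helper, not the class itself): a space is prepended only when the option is set and the input does not already begin with one.

#include <string>

std::string with_prefix_space(std::string s, bool add_prefix_space) {
  if (add_prefix_space && !s.empty() && s[0] != ' ') {
    s.insert(s.begin(), ' ');  // "hello" -> " hello"; " hi" stays " hi"
  }
  return s;
}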
@@ -161,13 +158,13 @@ SequencePreTokenizer::SequencePreTokenizer(
     std::vector<PreTokenizer::Ptr> pre_tokenizers)
     : pre_tokenizers_(std::move(pre_tokenizers)) {}
 
-std::vector<std::string> SequencePreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+SequencePreTokenizer::pre_tokenize(re2::StringPiece input) const {
   std::vector<std::string> pieces{std::string(input)};
-  for (const auto& pre_tokenizer : pre_tokenizers_) {
+  for (const auto &pre_tokenizer : pre_tokenizers_) {
     std::vector<std::string> new_pieces;
-    for (const auto& piece : pieces) {
-      for (const auto& subpiece : pre_tokenizer->pre_tokenize(piece)) {
+    for (const auto &piece : pieces) {
+      for (const auto &subpiece : pre_tokenizer->pre_tokenize(piece)) {
         new_pieces.push_back(subpiece);
       }
     }
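
The nested loops above implement a fan-out: each pre-tokenizer re-splits every piece produced by the one before it. A minimal standalone sketch of the same chaining over plain string splitters (hypothetical, for illustration only):

#include <functional>
#include <string>
#include <vector>

using Splitter = std::function<std::vector<std::string>(const std::string &)>;

std::vector<std::string> chain(const std::vector<Splitter> &stages,
                               const std::string &input) {
  std::vector<std::string> pieces{input};
  for (const auto &stage : stages) {
    std::vector<std::string> new_pieces;
    for (const auto &piece : pieces) {
      for (const auto &sub : stage(piece)) {
        new_pieces.push_back(sub);  // every stage sees every current piece
      }
    }
    pieces = std::move(new_pieces);
  }
  return pieces;
}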