3 files changed, +16 -27 lines changed

include/pytorch/tokenizers (header):

@@ -25,8 +25,6 @@
 #include <pytorch/tokenizers/string_integer_map.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
-#include "re2/re2.h"
-
 namespace tokenizers {
 namespace detail {
 
@@ -106,25 +104,6 @@ static Result<TokenMap> buildTokenMap(
   return buildTokenMap(std::move(pairs));
 }
 
-static Result<std::unique_ptr<IRegex>> build_special_token_regex(
-    const TokenMap& special_token_map) {
-  std::string special_pattern;
-  const std::size_t count = special_token_map.size();
-
-  for (std::size_t i = 0; i < count; ++i) {
-    const auto& [token, _] = special_token_map.getElement(i);
-    if (!special_pattern.empty()) {
-      special_pattern += "|";
-    }
-    special_pattern += re2::RE2::QuoteMeta(std::string(token));
-  }
-
-  if (special_pattern.empty()) {
-    return static_cast<std::unique_ptr<IRegex>>(nullptr);
-  }
-  return create_regex(special_pattern);
-}
-
 class BPETokenizerBase : public Tokenizer {
  public:
   Result<std::vector<uint64_t>>
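The helper removed above quotes each special token before joining them with "|". A small standalone check of why that quoting matters, using plain RE2 and an illustrative token string rather than the library's IRegex/Result types:

// Standalone sketch (plain RE2, illustrative token): RE2::QuoteMeta escapes
// regex metacharacters such as '<', '|' and '>', so a special token matches
// itself literally instead of being read as an alternation.
#include <iostream>
#include <string>
#include "re2/re2.h"

int main() {
  const std::string token = "<|endoftext|>";
  re2::RE2 quoted_re(re2::RE2::QuoteMeta(token));  // pattern "\<\|endoftext\|\>"
  re2::RE2 raw_re(token);  // read as regex: '<' | "endoftext" | '>'
  std::cout << re2::RE2::FullMatch(token, quoted_re) << "\n";  // 1: literal match
  std::cout << re2::RE2::FullMatch(token, raw_re) << "\n";     // 0: alternation never covers the whole string
  return 0;
}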
HFTokenizer source:

@@ -69,11 +69,6 @@ Error HFTokenizer::load(const std::string& path) {
         special_tokens,
         [](const auto& it) -> std::string { return it.at("content"); },
         [](const auto& it) -> std::uint64_t { return it.at("id"); }));
-
-    // Create special token regex to help later with encoding.
-    special_token_regex_ = TK_UNWRAP(detail::build_special_token_regex(special_token_map));
-
-    // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
   } catch (const json::out_of_range& e) {
     fprintf(stderr, "Could not parse special tokens: %s\n", e.what());
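With the regex construction removed from HFTokenizer::load, the special tokens are still projected out of the parsed JSON by the two lambdas above and stored in special_token_map_. A hedged sketch of that (content, id) projection with nlohmann::json; the "added_tokens" key and the sample entries are assumptions modeled on the Hugging Face tokenizer.json layout, not taken from this code:

// Hypothetical sketch of the projection the two lambdas perform: each
// special-token object carries a "content" string and an integer "id".
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <nlohmann/json.hpp>

int main() {
  const auto parsed = nlohmann::json::parse(R"({
    "added_tokens": [
      {"id": 0, "content": "<unk>"},
      {"id": 1, "content": "<s>"},
      {"id": 2, "content": "</s>"}
    ]
  })");

  std::vector<std::pair<std::string, std::uint64_t>> pairs;
  for (const auto& it : parsed.at("added_tokens")) {
    // Mirrors the lambdas above: project each entry to its text and its id.
    pairs.emplace_back(it.at("content").get<std::string>(),
                       it.at("id").get<std::uint64_t>());
  }
  for (const auto& [content, id] : pairs) {
    std::cout << content << " -> " << id << "\n";
  }
  return 0;
}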
Tiktoken source:

@@ -32,6 +32,7 @@
 #include <fstream>
 #include <limits>
 #include <unordered_set>
+#include "re2/re2.h"
 
 namespace tokenizers {
 
@@ -46,6 +47,20 @@ static Result<std::unique_ptr<IRegex>> _create_regex(
   return create_regex(pattern);
 }
 
+static Result<std::unique_ptr<IRegex>> _build_special_token_regex(
+    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
+  std::string special_pattern;
+  for (const auto& ele : special_encoder) {
+    if (!special_pattern.empty()) {
+      special_pattern += "|";
+    }
+    special_pattern += re2::RE2::QuoteMeta(ele.first);
+  }
+  if (special_pattern.empty()) {
+    return static_cast<std::unique_ptr<IRegex>>(nullptr);
+  }
+  return _create_regex(special_pattern);
+}
 
 static Result<std::pair<std::string, uint64_t>> _parse(
     const std::string& line) {
@@ -138,7 +153,7 @@ Error Tiktoken::load(const std::string& path) {
 
   _regex = TK_UNWRAP(_create_regex(_pattern));
   special_token_regex_ =
-      TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
+      TK_UNWRAP(_build_special_token_regex(special_token_map));
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = token_map_->size() + special_token_map_->size();
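The _build_special_token_regex helper added above returns a null regex when the special-token list is empty, and Tiktoken::load simply unwraps that result. A minimal sketch of the same contract using std::unique_ptr<re2::RE2> in place of the library's Result<std::unique_ptr<IRegex>> (helper name and types here are illustrative, not the library's API):

// Sketch of the null-result contract: an empty special-token list yields a
// null pointer, which callers treat as "nothing to split on" rather than
// compiling an empty (match-everything) pattern.
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"

static std::unique_ptr<re2::RE2> build_special_regex(
    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
  std::string pattern;
  for (const auto& ele : special_encoder) {
    if (!pattern.empty()) {
      pattern += "|";
    }
    pattern += re2::RE2::QuoteMeta(ele.first);  // escape so each token matches literally
  }
  if (pattern.empty()) {
    return nullptr;  // no special tokens configured
  }
  return std::make_unique<re2::RE2>(pattern);
}

int main() {
  auto re = build_special_regex({});  // empty encoder -> null regex
  if (!re) {
    std::cout << "no special tokens; skip the special-token pass\n";
  }
  return 0;
}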