@@ -19,12 +19,11 @@ using namespace tokenizers;
1919
2020// Helpers /////////////////////////////////////////////////////////////////////
2121
22- static void assert_split_match (
23- const PreTokenizer& ptok,
24- const std::string& prompt,
25- const std::vector<std::string>& expected) {
22+ static void assert_split_match (const PreTokenizer &ptok,
23+ const std::string &prompt,
24+ const std::vector<std::string> &expected) {
2625 re2::StringPiece prompt_view (prompt);
27- const auto & got = ptok.pre_tokenize (prompt_view);
26+ const auto & got = ptok.pre_tokenize (prompt_view);
2827 EXPECT_EQ (expected.size (), got.size ());
2928 for (auto i = 0 ; i < got.size (); ++i) {
3029 EXPECT_EQ (expected[i], got[i]);
@@ -35,16 +34,14 @@ static void assert_split_match(
3534class RegexPreTokenizerTest : public ::testing::Test {};
3635
3736// Test the basic construction
38- TEST_F (RegexPreTokenizerTest, Construct) {
39- RegexPreTokenizer ptok (" [0-9]+" );
40- }
37+ TEST_F (RegexPreTokenizerTest, Construct) { RegexPreTokenizer ptok (" [0-9]+" ); }
4138
4239// Test basic splitting using the expression for Tiktoken
4340TEST_F (RegexPreTokenizerTest, TiktokenExpr) {
4441 RegexPreTokenizer ptok (
4542 R"( (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)" );
46- assert_split_match (
47- ptok, " How are you doing? " , {" How" , " are" , " you" , " doing" , " ?" });
43+ assert_split_match (ptok, " How are you doing? " ,
44+ {" How" , " are" , " you" , " doing" , " ?" });
4845}
4946
5047// DigitsPreTokenizer //////////////////////////////////////////////////////////
@@ -54,18 +51,15 @@ class DigitsPreTokenizerTest : public ::testing::Test {};
5451TEST_F (DigitsPreTokenizerTest, IndividualDigits) {
5552 DigitsPreTokenizer ptok (true );
5653 assert_split_match (
57- ptok,
58- " The number 1 then 234 then 5." ,
54+ ptok, " The number 1 then 234 then 5." ,
5955 {" The number " , " 1" , " then " , " 2" , " 3" , " 4" , " then " , " 5" , " ." });
6056}
6157
6258// Test digit splitting with contiguous digits
6359TEST_F (DigitsPreTokenizerTest, ContiguousDigits) {
6460 DigitsPreTokenizer ptok (false );
65- assert_split_match (
66- ptok,
67- " The number 1 then 234 then 5." ,
68- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
61+ assert_split_match (ptok, " The number 1 then 234 then 5." ,
62+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
6963}
7064
7165// ByteLevelPreTokenizer ///////////////////////////////////////////////////////
@@ -75,8 +69,7 @@ TEST_F(ByteLevelPreTokenizerTest, PreTokenizeDefault) {
7569 ByteLevelPreTokenizer ptok;
7670 assert_split_match (ptok, " Hello World" , {" ĠHello" , " ĠWorld" });
7771 assert_split_match (
78- ptok,
79- " The number 1 then 234 then 5." ,
72+ ptok, " The number 1 then 234 then 5." ,
8073 {" ĠThe" , " Ġnumber" , " Ġ1" , " Ġthen" , " Ġ234" , " Ġthen" , " Ġ5" , " ." });
8174}
8275
@@ -97,22 +90,9 @@ TEST_F(SequencePreTokenizerTest, PreTokenizeDigitAndByteLevel) {
9790 PreTokenizer::Ptr dptok (new DigitsPreTokenizer (true ));
9891 PreTokenizer::Ptr bptok (new ByteLevelPreTokenizer (false ));
9992 SequencePreTokenizer ptok ({dptok, bptok});
100- assert_split_match (
101- ptok,
102- " The number 1 then 234 then 5." ,
103- {" The" ,
104- " Ġnumber" ,
105- " Ġ" ,
106- " 1" ,
107- " Ġthen" ,
108- " Ġ" ,
109- " 2" ,
110- " 3" ,
111- " 4" ,
112- " Ġthen" ,
113- " Ġ" ,
114- " 5" ,
115- " ." });
93+ assert_split_match (ptok, " The number 1 then 234 then 5." ,
94+ {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
95+ " Ġthen" , " Ġ" , " 5" , " ." });
11696}
11797
11898// PreTokenizerConfig //////////////////////////////////////////////////////////
@@ -152,14 +132,12 @@ TEST_F(PreTokenizerConfigTest, AllTypesFailureCases) {
152132
153133 // Sequence
154134 EXPECT_THROW (PreTokenizerConfig (" Sequence" ).create (), std::runtime_error);
155- EXPECT_THROW (
156- PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
157- std::runtime_error);
158- EXPECT_THROW (
159- PreTokenizerConfig (" Sequence" )
160- .set_pretokenizers ({PreTokenizerConfig (" Split" )})
161- .create (),
162- std::runtime_error);
135+ EXPECT_THROW (PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
136+ std::runtime_error);
137+ EXPECT_THROW (PreTokenizerConfig (" Sequence" )
138+ .set_pretokenizers ({PreTokenizerConfig (" Split" )})
139+ .create (),
140+ std::runtime_error);
163141
164142 // Unsupported
165143 EXPECT_THROW (PreTokenizerConfig (" Unsupported" ).create (), std::runtime_error);
@@ -183,22 +161,9 @@ TEST_F(PreTokenizerConfigTest, ParseJson) {
183161 }},
184162 })
185163 .create ();
186- assert_split_match (
187- *ptok,
188- " The number 1 then 234 then 5." ,
189- {" The" ,
190- " Ġnumber" ,
191- " Ġ" ,
192- " 1" ,
193- " Ġthen" ,
194- " Ġ" ,
195- " 2" ,
196- " 3" ,
197- " 4" ,
198- " Ġthen" ,
199- " Ġ" ,
200- " 5" ,
201- " ." });
164+ assert_split_match (*ptok, " The number 1 then 234 then 5." ,
165+ {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
166+ " Ġthen" , " Ġ" , " 5" , " ." });
202167}
203168
204169TEST_F (PreTokenizerConfigTest, ParseJsonOptionalKey) {
@@ -208,10 +173,8 @@ TEST_F(PreTokenizerConfigTest, ParseJsonOptionalKey) {
208173 {" type" , " Digits" },
209174 })
210175 .create ();
211- assert_split_match (
212- *ptok,
213- " The number 1 then 234 then 5." ,
214- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
176+ assert_split_match (*ptok, " The number 1 then 234 then 5." ,
177+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
215178}
216179
217180TEST_F (PreTokenizerConfigTest, Split) {
0 commit comments