5 changes: 4 additions & 1 deletion src/transformers/convert_slow_tokenizer.py
@@ -1319,7 +1319,10 @@ def tokenizer(self, proto):
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)

user_defined_symbols = [
AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
]
tokenizer.add_tokens(user_defined_symbols)
return tokenizer
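For context, here is a minimal, hedged sketch of what the added lines accomplish, written against the public `tokenizers` API and the protobuf definitions vendored in `transformers.utils`; the `tokenizer.model` path is a placeholder, not part of the PR:

```python
# Minimal sketch (not the converter itself): read the symbols that were passed as
# --user_defined_symbols when the SentencePiece model was trained, and register
# them on a fast tokenizer so they are never split.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import Unigram
from transformers.utils import sentencepiece_model_pb2 as sp_pb2

proto = sp_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path to a SentencePiece model file
    proto.ParseFromString(f.read())

# A bare Unigram tokenizer built from the proto's pieces, roughly what
# SpmConverter.tokenizer() has produced by the time the new lines run.
vocab = [(piece.piece, piece.score) for piece in proto.pieces]
tokenizer = Tokenizer(Unigram(vocab, unk_id=proto.trainer_spec.unk_id))

# The change itself: each user-defined symbol becomes a non-normalized,
# non-special added token on the fast tokenizer.
user_defined_symbols = [
    AddedToken(token, normalized=False, special=False)
    for token in proto.trainer_spec.user_defined_symbols
]
tokenizer.add_tokens(user_defined_symbols)
```

With this in place, a symbol declared at SentencePiece training time tokenizes as a single token after the slow-to-fast conversion instead of being split into sub-pieces.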


2 changes: 1 addition & 1 deletion tests/models/llama/test_tokenization_llama.py
@@ -52,7 +52,7 @@
@require_sentencepiece
@require_tokenizers
class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "hf-internal-testing/llama-tokenizer"
from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]
Member (review comment): Looks good to me!
tokenizer_class = LlamaTokenizer
rust_tokenizer_class = LlamaTokenizerFast
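Since `from_pretrained_id` is now a list, the shared tests in `TokenizerTesterMixin` exercise every listed checkpoint. Below is a hedged, self-contained sketch of the resulting (class, checkpoint, kwargs) triples and how they are typically consumed; `meta-llama/Llama-2-7b-hf` is gated, so an authorized Hub token is assumed:

```python
from transformers import LlamaTokenizerFast

# The triples setUp() will build for the class above (see the
# test_tokenization_common.py diff below): one per pretrained id.
tokenizers_list = [
    (LlamaTokenizerFast, "hf-internal-testing/llama-tokenizer", {}),
    (LlamaTokenizerFast, "meta-llama/Llama-2-7b-hf", {}),
]
for tokenizer_class, pretrained_name, kwargs in tokenizers_list:
    tokenizer = tokenizer_class.from_pretrained(pretrained_name, **kwargs)
    print(pretrained_name, tokenizer.tokenize("Hello world"))
```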

59 changes: 58 additions & 1 deletion tests/test_tokenization_common.py
@@ -51,6 +51,7 @@
get_tests_dir,
is_pt_tf_cross_test,
require_jinja,
require_read_token,
require_tf,
require_tokenizers,
require_torch,
@@ -200,13 +201,19 @@ class TokenizerTesterMixin:
def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.from_pretrained_id = (
[self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
)

self.tokenizers_list = []
if self.test_rust_tokenizer:
self.tokenizers_list = [
(
self.rust_tokenizer_class,
self.from_pretrained_id,
pretrained_id,
self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
)
for pretrained_id in self.from_pretrained_id
]
else:
self.tokenizers_list = []
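A quick illustration of why the guard above keeps older test classes working: a bare string becomes a one-element list, while an existing list is passed through unchanged (the `normalize` helper below is hypothetical, mirroring the two added lines):

```python
def normalize(from_pretrained_id):
    # Mirrors the setUp() guard: wrap a single checkpoint id, leave lists alone.
    return [from_pretrained_id] if isinstance(from_pretrained_id, str) else from_pretrained_id

assert normalize("hf-internal-testing/llama-tokenizer") == ["hf-internal-testing/llama-tokenizer"]
assert normalize(["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]) == [
    "hf-internal-testing/llama-tokenizer",
    "meta-llama/Llama-2-7b-hf",
]
```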
@@ -1544,6 +1551,56 @@ def test_maximum_encoding_length_pair_input(self):
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :])

@slow
@require_read_token
def test_encode_decode_fast_slow_all_tokens(self):
if self.rust_tokenizer_class is not None:
pretrained_name = self.from_pretrained_id

slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
with self.subTest(f"{pretrained_name}"):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
pretrained_name, from_slow=True, legacy=False
)
input_full_vocab_ids = list(
range(len(slow_tokenizer))
) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations
input_full_vocab_string = rust_tokenizer.convert_tokens_to_string(
rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids)
)
print(f"Length of the input string that is tested: {len(input_full_vocab_string)}")

for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
string_to_check = input_full_vocab_string[chunk : chunk + 1024]
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
slow_encode = slow_tokenizer.encode(string_to_check)
fast_encode = rust_tokenizer.encode(string_to_check)
self.assertEquals(
slow_encode,
fast_encode,
"Hint: the following tokenization diff were obtained for slow vs fast:\n "
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
f"string used : {string_to_check}",
)
print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
with self.subTest(f"{(chunk/len(input_full_vocab_ids))*100}%"):
self.assertEquals(
slow_tokenizer.decode(
ids_to_decode,
space_between_special_tokens=False,
clean_up_tokenization_spaces=False,
),
rust_tokenizer.decode(
ids_to_decode,
space_between_special_tokens=False,
clean_up_tokenization_spaces=False,
),
f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}",
)
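For reference, here is a hedged standalone sketch of the parity check this test performs, runnable outside the test harness; it assumes access to the gated `meta-llama/Llama-2-7b-hf` checkpoint (the second id added to the Llama test above) and mirrors the 1024-character chunking:

```python
# Standalone sketch of the slow/fast round-trip check (not the test itself).
from transformers import AutoTokenizer

ckpt = "meta-llama/Llama-2-7b-hf"  # gated: requires an authorized Hub token
slow = AutoTokenizer.from_pretrained(ckpt, use_fast=False, legacy=False)
fast = AutoTokenizer.from_pretrained(ckpt, use_fast=True, from_slow=True, legacy=False)

# Decode the whole vocabulary into one long string, then re-encode it in
# 1024-character windows and compare slow vs fast token ids.
all_ids = list(range(len(slow)))
text = fast.convert_tokens_to_string(fast.convert_ids_to_tokens(all_ids))
for start in range(0, len(text) - 1024, 1024):
    chunk = text[start : start + 1024]
    assert slow.encode(chunk) == fast.encode(chunk), f"slow/fast mismatch near char {start}"
```

The test itself is marked `@slow` and `@require_read_token`, so it only runs in the slow test suite with a Hub token that can read the gated checkpoint.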

# def test_encode_input_type(self):
# tokenizers = self.get_tokenizers(do_lower_case=False)
# for tokenizer in tokenizers: