diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5d0dc48e5aee..65bab658c8b6 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3192,7 +3192,7 @@ def apply_chat_template( truncation: bool = False, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - return_dict: bool = False, + return_dict: bool = True, return_assistant_tokens_mask: bool = False, tokenizer_kwargs: Optional[dict[str, Any]] = None, **kwargs, @@ -3265,14 +3265,11 @@ def apply_chat_template( set, will return a dict of tokenizer outputs instead. """ - if return_dict and not tokenize: - raise ValueError( - "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict " - "of tokenizer outputs to return." - ) + if not tokenize: + return_dict = False # dicts are only returned by the tokenizer anyway - if return_assistant_tokens_mask and not return_dict: - raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`") + if return_assistant_tokens_mask and not (return_dict and tokenize): + raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`") if tokenizer_kwargs is None: tokenizer_kwargs = {} @@ -3387,13 +3384,17 @@ def encode_message_with_chat_template( ) if conversation_history is None or len(conversation_history) == 0: - return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs) + return self.apply_chat_template( + [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs + ) conversation = conversation_history + [message] - tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs) + tokens = self.apply_chat_template( + conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs + ) prefix_tokens = self.apply_chat_template( - conversation_history, add_generation_prompt=False, tokenize=True, **kwargs + conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs ) # It's possible that the prefix tokens are not a prefix of the full list of tokens. # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`. diff --git a/tests/models/blenderbot/test_tokenization_blenderbot.py b/tests/models/blenderbot/test_tokenization_blenderbot.py index 8f7c60f2bf2e..37fece070949 100644 --- a/tests/models/blenderbot/test_tokenization_blenderbot.py +++ b/tests/models/blenderbot/test_tokenization_blenderbot.py @@ -21,23 +21,3 @@ def test_pretokenized_inputs(self, *args, **kwargs): # The issue is that when you have a sequence with leading spaces, splitting it # with .split() loses the leading spaces, so the tokenization results differ pass - - def test_tokenization_for_chat(self): - tok = self.get_tokenizer() - test_chats = [ - [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], - [ - {"role": "system", "content": "You are a helpful chatbot."}, - {"role": "user", "content": "Hello!"}, - {"role": "assistant", "content": "Nice to meet you."}, - ], - [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], - ] - tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats] - expected_tokens = [ - [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2], - [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2], - [3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2], - ] - for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): - self.assertListEqual(tokenized_chat, expected_tokens) diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 267d377ed8e5..6d0ea31f3f8a 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -17,7 +17,7 @@ from datasets import load_dataset from transformers import TokenizersBackend -from transformers.testing_utils import require_jinja, require_tokenizers, slow +from transformers.testing_utils import require_tokenizers, slow from ...test_tokenization_common import TokenizerTesterMixin @@ -129,28 +129,6 @@ def test_encodings_from_xnli_dataset(self): predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens] self.assertListEqual(predicted_text, input_text) - @require_jinja - def test_tokenization_for_chat(self): - tokenizer = self.get_tokenizer() - tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" - test_chats = [ - [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], - [ - {"role": "system", "content": "You are a helpful chatbot."}, - {"role": "user", "content": "Hello!"}, - {"role": "assistant", "content": "Nice to meet you."}, - ], - [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], - ] - tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] - expected_tokens = [ - [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2], - [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2], - [229126, 427, 11890, 1152, 17, 2, 59414, 4, 2], - ] - for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): - self.assertListEqual(tokenized_chat, expected_tokens) - def test_add_prefix_space_fast(self): tokenizer_w_prefix = self.get_tokenizer(add_prefix_space=True) tokenizer_wo_prefix = self.get_tokenizer(add_prefix_space=False) diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index b428c4fa9bca..75de9835fa01 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -73,32 +73,6 @@ def test_pretrained_model_lists(self): self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) - @require_jinja - def test_tokenization_for_chat(self): - tokenizer = self.get_tokenizer() - test_chats = [ - [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], - [ - {"role": "system", "content": "You are a helpful chatbot."}, - {"role": "user", "content": "Hello!"}, - {"role": "assistant", "content": "Nice to meet you."}, - ], - ] - tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] - # fmt: off - expected_tokens = [ - [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8], - [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, - 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, - 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, - 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, - 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38, - 54, 567, 235, 693, 276, 411, 243, 22, 8] - ] - # fmt: on - for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): - self.assertListEqual(tokenized_chat, expected_tokens) - @require_jinja def test_tokenization_for_tool_use(self): tokenizer = self.get_tokenizer() diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index de85bde7666d..8e409064320c 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -16,7 +16,7 @@ import unittest from transformers import AutoTokenizer, GPT2Tokenizer -from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers +from transformers.testing_utils import require_tiktoken, require_tokenizers from ...test_tokenization_common import TokenizerTesterMixin @@ -67,26 +67,6 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self): filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) - @require_jinja - def test_tokenization_for_chat(self): - tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname) - tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" - test_chats = [ - [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], - [ - {"role": "system", "content": "You are a helpful chatbot."}, - {"role": "user", "content": "Hello!"}, - {"role": "assistant", "content": "Nice to meet you."}, - ], - [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], - ] - tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] - # fmt: off - expected_tokens = [[1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256], [1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256, 35284, 284, 1826, 345, 13, 50256], [35284, 284, 1826, 345, 13, 50256, 15496, 0, 50256]] - # fmt: on - for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): - self.assertListEqual(tokenized_chat, expected_tokens) - @require_tiktoken def test_tokenization_tiktoken(self): from tiktoken import encoding_name_for_model diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py index 6d3fd89a91ea..7dbcd524e810 100644 --- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py +++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py @@ -15,7 +15,7 @@ import unittest from transformers import GPTSw3Tokenizer -from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow from ...test_tokenization_common import TokenizerTesterMixin @@ -129,36 +129,3 @@ def test_tokenizer_integration(self): model_name="AI-Sweden-Models/gpt-sw3-126m", sequences=sequences, ) - - @require_jinja - def test_tokenization_for_chat(self): - tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, name_or_path="test") - tokenizer.chat_template = ( - "{{ eos_token }}{{ bos_token }}" - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" - "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" - "{{ message['text'] }}{{ bos_token }}" - "{% endfor %}" - "Bot:" - ) - # This is in English, but it's just here to make sure the chat control tokens are being added properly - test_chats = [ - [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], - [ - {"role": "system", "content": "You are a helpful chatbot."}, - {"role": "user", "content": "Hello!"}, - {"role": "assistant", "content": "Nice to meet you."}, - ], - [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], - ] - tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] - # fmt: off - expected_tokens = [ - [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419], - [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419], - [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419] - ] - # fmt: on - for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): - self.assertListEqual(tokenized_chat, expected_tokens) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 808823c3ff44..673f8def3159 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -950,7 +950,9 @@ def test_chat_template(self): dummy_conversation, chat_template=dummy_template, tokenize=True, return_dict=False ) dict_output = tokenizer.apply_chat_template( - dummy_conversation, chat_template=dummy_template, tokenize=True, return_dict=True + dummy_conversation, + chat_template=dummy_template, + tokenize=True, # This also checks return_dict=True is the default ) self.assertEqual(dict_output["input_ids"], output) # Test return_dict behaviour matches