diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py
index 47fe00bf3e9f..15a086703cf7 100644
--- a/src/transformers/models/voxtral/processing_voxtral.py
+++ b/src/transformers/models/voxtral/processing_voxtral.py
@@ -206,7 +206,7 @@ def apply_chat_template(
         tokenizer_kwargs = {**processed_kwargs["template_kwargs"], **text_kwargs}
         tokenizer_kwargs["return_tensors"] = None  # let's not return tensors here
         tokenize = tokenizer_kwargs.pop("tokenize", False)
-        return_dict = tokenizer_kwargs.pop("return_dict", False)
+        return_dict = tokenizer_kwargs.pop("return_dict", True)

         encoded_instruct_inputs = self.tokenizer.apply_chat_template(
             conversations,
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index a70018c6cf2e..c3487aca431b 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1603,7 +1603,7 @@ def apply_chat_template(
             conversations = [conversation]

         tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
-        return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
+        return_dict = processed_kwargs["template_kwargs"].pop("return_dict", True)
         mm_load_kwargs = processed_kwargs["mm_load_kwargs"]

         if tokenize:
diff --git a/src/transformers/tokenization_mistral_common.py b/src/transformers/tokenization_mistral_common.py
index 3eae0a6f6878..713d0cbc6bc1 100644
--- a/src/transformers/tokenization_mistral_common.py
+++ b/src/transformers/tokenization_mistral_common.py
@@ -1378,7 +1378,7 @@ def apply_chat_template(
         truncation: bool = False,
         max_length: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        return_dict: bool = False,
+        return_dict: bool = True,
         **kwargs,
     ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
         """
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index afdd8270987a..24228738fcde 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1588,7 +1588,7 @@ def apply_chat_template(
         truncation: bool = False,
         max_length: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        return_dict: bool = False,
+        return_dict: bool = True,
         return_assistant_tokens_mask: bool = False,
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
         **kwargs,
@@ -1661,14 +1661,11 @@ def apply_chat_template(
                 set, will return a dict of tokenizer outputs instead.
         """

-        if return_dict and not tokenize:
-            raise ValueError(
-                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
-                "of tokenizer outputs to return."
-            )
+        if not tokenize:
+            return_dict = False  # dicts are only returned by the tokenizer anyway

-        if return_assistant_tokens_mask and not return_dict:
-            raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`")
+        if return_assistant_tokens_mask and not (return_dict and tokenize):
+            raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`")

         if tokenizer_kwargs is None:
             tokenizer_kwargs = {}
@@ -1783,13 +1780,17 @@ def encode_message_with_chat_template(
             )

         if conversation_history is None or len(conversation_history) == 0:
-            return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs)
+            return self.apply_chat_template(
+                [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
+            )

         conversation = conversation_history + [message]
-        tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs)
+        tokens = self.apply_chat_template(
+            conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
+        )
         prefix_tokens = self.apply_chat_template(
-            conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
+            conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
         )
         # It's possible that the prefix tokens are not a prefix of the full list of tokens.
         # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`.
diff --git a/tests/models/blenderbot/test_tokenization_blenderbot.py b/tests/models/blenderbot/test_tokenization_blenderbot.py
index 0997ac4bde41..da6741940c90 100644
--- a/tests/models/blenderbot/test_tokenization_blenderbot.py
+++ b/tests/models/blenderbot/test_tokenization_blenderbot.py
@@ -18,7 +18,6 @@
 from functools import cached_property

 from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
-from transformers.testing_utils import require_jinja


 class Blenderbot3BTokenizerTests(unittest.TestCase):
@@ -51,24 +50,3 @@ def test_3B_tokenization_same_as_parlai(self):
     def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
         assert self.rust_tokenizer_3b.add_prefix_space
         assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]
-
-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tok = self.tokenizer_3b
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats]
-        expected_tokens = [
-            [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2],
-            [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2],
-            [3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2],
-        ]
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py
index 79d330e40277..4a4840dfd9f3 100644
--- a/tests/models/bloom/test_tokenization_bloom.py
+++ b/tests/models/bloom/test_tokenization_bloom.py
@@ -18,7 +18,7 @@
 from datasets import load_dataset

 from transformers import BloomTokenizerFast
-from transformers.testing_utils import require_jinja, require_tokenizers
+from transformers.testing_utils import require_tokenizers

 from ...test_tokenization_common import TokenizerTesterMixin

@@ -137,28 +137,6 @@ def test_encodings_from_xnli_dataset(self):
         predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens]
         self.assertListEqual(predicted_text, input_text)

-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = self.get_rust_tokenizer()
-        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        expected_tokens = [
-            [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2],
-            [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2],
-            [229126, 427, 11890, 1152, 17, 2, 59414, 4, 2],
-        ]
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
-
     def test_add_prefix_space_fast(self):
         tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
         tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py
index 73a0942522ac..ce56bbeb6a84 100644
--- a/tests/models/cohere/test_tokenization_cohere.py
+++ b/tests/models/cohere/test_tokenization_cohere.py
@@ -146,32 +146,6 @@ def test_pretrained_model_lists(self):
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)

-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = self.get_rust_tokenizer()
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [
-            [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8],
-            [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65,
-            59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8,
-            36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59,
-            45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61,
-            58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38,
-            54, 567, 235, 693, 276, 411, 243, 22, 8]
-        ]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
-
     @require_jinja
     def test_tokenization_for_tool_use(self):
         tokenizer = self.get_rust_tokenizer()
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 913f7546e84a..0bae68e4b0e3 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -27,7 +27,6 @@ from transformers.testing_utils import (
     get_tests_dir,
     nested_simplify,
-    require_jinja,
     require_read_token,
     require_sentencepiece,
     require_tokenizers,
@@ -428,25 +427,6 @@ def test_some_edge_cases(self):
         # a dummy prefix space is not added by the sp_model as it was de-activated
         self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))

-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
-
-        test_chats = [
-            [{"role": "user", "content": "Hello!"}],
-            [
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "user", "content": "Hello!"}],
-        ]
-        # Matt: The third test case tests the default system message, but if this is ever changed in the
-        # class/repo code then that test will fail, and the case will need to be updated.
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        expected_tokens = [[235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108, 235322, 235371, 571, 235298, 2997, 73786, 105776, 108, 7731, 577, 4664, 692, 35606, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108]]  # fmt: skip
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
-
     def test_save_fast_load_slow(self):
         # Ensure that we can save a fast tokenizer and load it as a slow tokenizer
         slow_tokenizer = self.tokenizer
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index c69e0b521086..be6b90bc4637 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -19,7 +19,7 @@
 from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
+from transformers.testing_utils import require_tiktoken, require_tokenizers

 from ...test_tokenization_common import TokenizerTesterMixin

@@ -281,28 +281,6 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
         filtered_sequence = [x for x in filtered_sequence if x is not None]
         self.assertEqual(encoded_sequence, filtered_sequence)

-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
-        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [[20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20],
-                           [20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20, 20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20],
-                           [20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20, 20, 3, 0, 0, 1, 20, 20]]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
-
     @require_tiktoken
     def test_tokenization_tiktoken(self):
         from tiktoken import encoding_name_for_model
diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
index 4a1a3292c5bf..c77eaecede2a 100644
--- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
+++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -15,7 +15,7 @@
 import unittest

 from transformers import GPTSw3Tokenizer
-from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

 from ...test_tokenization_common import TokenizerTesterMixin

@@ -127,36 +127,3 @@ def test_tokenizer_integration(self):
             model_name="AI-Sweden-Models/gpt-sw3-126m",
             sequences=sequences,
         )
-
-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
-        tokenizer.chat_template = (
-            "{{ eos_token }}{{ bos_token }}"
-            "{% for message in messages %}"
-            "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
-            "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
-            "{{ message['text'] }}{{ bos_token }}"
-            "{% endfor %}"
-            "Bot:"
-        )
-        # This is in English, but it's just here to make sure the chat control tokens are being added properly
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [
-            [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419],
-            [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419],
-            [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419]
-        ]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 58eb1f4e86e8..d69965b1b268 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -32,7 +32,6 @@ from transformers.testing_utils import (
     get_tests_dir,
     nested_simplify,
-    require_jinja,
     require_read_token,
     require_sentencepiece,
     require_tiktoken,
@@ -702,32 +701,6 @@ def test_fast_post_processor(self):
         with self.assertRaises(ValueError):
             tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True)

-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
-
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "user", "content": "Hello!"}],
-        ]
-        # Matt: The third test case tests the default system message, but if this is ever changed in the
-        # class/repo code then that test will fail, and the case will need to be updated.
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [
-            [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962],
-            [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2],
-            [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962]
-        ]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
-

 @require_sentencepiece
 @require_tokenizers
diff --git a/tests/test_tokenization_mistral_common.py b/tests/test_tokenization_mistral_common.py
index 82dba87f7d7e..f33501cdc432 100644
--- a/tests/test_tokenization_mistral_common.py
+++ b/tests/test_tokenization_mistral_common.py
@@ -799,7 +799,9 @@ def test_apply_chat_template_basic(self):

         # Test 2:
         # without tokenize
-        self.assertEqual(self.tokenizer.apply_chat_template(conversation, tokenize=True), expected_tokenized.tokens)
+        self.assertEqual(
+            self.tokenizer.apply_chat_template(conversation, tokenize=True).input_ids, expected_tokenized.tokens
+        )

         with self.assertRaises(
             ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.apply_chat_template`."
@@ -824,7 +826,7 @@ def test_apply_chat_template_continue_final_message(self):
             expected_tokenized.text,
         )
         self.assertEqual(
-            self.tokenizer.apply_chat_template(conversation, tokenize=True, continue_final_message=True),
+            self.tokenizer.apply_chat_template(conversation, tokenize=True, continue_final_message=True).input_ids,
             expected_tokenized.tokens,
         )

@@ -846,7 +848,7 @@ def test_apply_chat_template_with_add_generation_prompt(self):
             token_outputs = self.tokenizer.apply_chat_template(
                 conversation, tokenize=True, add_generation_prompt=add_generation_prompt
             )
-            self.assertEqual(token_outputs, expected_tokenized.tokens)
+            self.assertEqual(token_outputs.input_ids, expected_tokenized.tokens)

         # Test 2:
         # with continue_final_message
@@ -958,18 +960,16 @@ def test_apply_chat_template_with_image(self):
             },
         ]

-        output = self.tokenizer.apply_chat_template(conversation, tokenize=True)
+        output = self.tokenizer.apply_chat_template(conversation).input_ids
         self.assertEqual(output, expected_tokenized.tokens)

-        output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_dict=True)
+        output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True)
         self.assertEqual(output_dict["input_ids"], expected_tokenized.tokens)
         self.assertEqual(len(output_dict["pixel_values"]), len(expected_tokenized.images))
         for o, e in zip(output_dict["pixel_values"], expected_tokenized.images):
             self.assertTrue(np.allclose(o, e))

-        output_dict = self.tokenizer.apply_chat_template(
-            conversation, tokenize=True, return_dict=True, return_tensors="pt"
-        )
+        output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="pt")
         self.assertEqual(output_dict["input_ids"].tolist()[0], expected_tokenized.tokens)
         expected_images_pt_tensor = torch.from_numpy(np.stack(expected_tokenized.images))
         self.assertTrue(torch.allclose(output_dict["pixel_values"], expected_images_pt_tensor))
@@ -1013,7 +1013,7 @@ def test_apply_chat_template_with_audio(self):
             },
         ]

-        output = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True)
+        output = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True).input_ids
         self.assertEqual(output, expected_tokenized.tokens)

         output_dict = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True, return_dict=True)
@@ -1041,14 +1041,14 @@ def test_apply_chat_template_with_truncation(self):
         # Test 1:
         # with truncation
         self.assertEqual(
-            self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=True, max_length=20),
+            self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=True, max_length=20).input_ids,
             expected_tokenized.tokens[:20],
         )

         # Test 2:
         # without truncation
         self.assertEqual(
-            self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=False, max_length=20),
+            self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=False, max_length=20).input_ids,
             expected_tokenized.tokens,
         )

@@ -1130,7 +1130,7 @@ def test_batch_apply_chat_template(self):
         ]

         text_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=False)
-        token_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True)
+        token_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True).input_ids

         self.assertEqual(len(text_outputs), len(token_outputs))
         self.assertEqual(len(text_outputs), len(expected_tokenized))
@@ -1202,7 +1202,7 @@ def test_batch_apply_chat_template_images(self):
             ChatCompletionRequest.from_openai(ref_conversation)
         )

-        output = self.tokenizer.apply_chat_template(conversations, tokenize=True)
+        output = self.tokenizer.apply_chat_template(conversations, tokenize=True).input_ids
         self.assertEqual(output, [expected_tokenized.tokens] * 3)

         output = self.tokenizer.apply_chat_template(conversations, tokenize=True, return_dict=True)
@@ -1248,7 +1248,9 @@ def test_batch_apply_chat_template_with_continue_final_message(self):
             for conversation in conversations
         ]

-        token_outputs = self.tokenizer.apply_chat_template(conversations, tokenize=True, continue_final_message=True)
+        token_outputs = self.tokenizer.apply_chat_template(
+            conversations, tokenize=True, continue_final_message=True
+        ).input_ids

         for output, expected in zip(token_outputs, expected_tokenized):
             self.assertEqual(output, expected.tokens)
@@ -1297,7 +1299,7 @@ def test_batch_apply_chat_template_with_add_generation_prompt(self):
             ]
             token_outputs = self.tokenizer.apply_chat_template(
                 conversations, tokenize=True, add_generation_prompt=add_generation_prompt
-            )
+            ).input_ids
             for output, expected in zip(token_outputs, expected_tokenized):
                 self.assertEqual(output, expected.tokens)

@@ -1331,7 +1333,7 @@ def test_batch_apply_chat_template_with_truncation(
         # with truncation
         token_outputs = self.tokenizer.apply_chat_template(
             self.fixture_conversations, tokenize=True, truncation=True, max_length=20
-        )
+        ).input_ids
         for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
             self.assertEqual(output, expected.tokens[:20])

@@ -1340,7 +1342,7 @@ def test_batch_apply_chat_template_with_truncation(
         # without truncation
         token_outputs = self.tokenizer.apply_chat_template(
             self.fixture_conversations, tokenize=True, truncation=False, max_length=20
-        )
+        ).input_ids
         self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations))
         for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
             self.assertEqual(output, expected.tokens)
@@ -1358,7 +1360,9 @@ def test_batch_apply_chat_template_with_padding(
         for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]:
             if padding == PaddingStrategy.MAX_LENGTH:
                 # No padding if no max length is provided
-                token_outputs = self.tokenizer.apply_chat_template(self.fixture_conversations, padding=padding)
+                token_outputs = self.tokenizer.apply_chat_template(
+                    self.fixture_conversations, padding=padding, return_dict=False
+                )
                 self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations))
                 for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
                     self.assertEqual(output, expected.tokens)
@@ -1366,7 +1370,7 @@
             max_length = 20 if padding == PaddingStrategy.MAX_LENGTH else None
             token_outputs = self.tokenizer.apply_chat_template(
-                self.fixture_conversations, tokenize=True, padding=padding, max_length=max_length
+                self.fixture_conversations, tokenize=True, padding=padding, max_length=max_length, return_dict=False
             )

             if padding != PaddingStrategy.MAX_LENGTH:
@@ -1390,7 +1394,7 @@ def test_batch_apply_chat_template_with_padding(

         for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]:
             token_outputs = self.tokenizer.apply_chat_template(
-                self.fixture_conversations, tokenize=True, padding=padding
+                self.fixture_conversations, tokenize=True, padding=padding, return_dict=False
             )
             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations))
             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
@@ -1402,7 +1406,12 @@ def test_batch_apply_chat_template_with_padding_and_truncation(
         max_length = 20
         for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]:
             token_outputs = self.tokenizer.apply_chat_template(
-                self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length
+                self.fixture_conversations,
+                tokenize=True,
+                truncation=True,
+                padding=padding,
+                max_length=max_length,
+                return_dict=False,
             )
             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations))
             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
@@ -1411,7 +1420,12 @@ def test_batch_apply_chat_template_with_padding_and_truncation(
             )
         for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]:
             token_outputs = self.tokenizer.apply_chat_template(
-                self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length
+                self.fixture_conversations,
+                tokenize=True,
+                truncation=True,
+                padding=padding,
+                max_length=max_length,
+                return_dict=False,
             )
             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations))
             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations):
@@ -1421,7 +1435,7 @@ def test_batch_apply_chat_template_return_tensors(self):
         # Test 1:
         # with tokenize
         token_outputs = self.tokenizer.apply_chat_template(
-            self.fixture_conversations, tokenize=True, return_tensors="pt", padding=True
+            self.fixture_conversations, tokenize=True, return_tensors="pt", padding=True, return_dict=False
        )
         self.assertIsInstance(token_outputs, torch.Tensor)
         self.assertEqual(
@@ -1432,7 +1446,7 @@ def test_batch_apply_chat_template_return_tensors(self):
         # Test 2:
         # without tokenize, should ignore return_tensors
         token_outputs = self.tokenizer.apply_chat_template(
-            self.fixture_conversations, tokenize=False, return_tensors="pt", padding=True
+            self.fixture_conversations, tokenize=False, return_tensors="pt", padding=True, return_dict=False
         )
         self.assertEqual(token_outputs, [t.text for t in self.tokenized_fixture_conversations])

diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index 24aac3719812..6fd20a2cf473 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -323,7 +323,7 @@ def test_encode_message(self):
         ]

         # First, test the default case, where we encode the whole conversation at once
-        whole_conversation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True)
+        whole_conversation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True, return_dict=False)

         # Now, test the message-by-message encoding
         tokens = []
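Net effect of the patch: `apply_chat_template` now returns a `BatchEncoding` by default whenever its output is tokenized, and `tokenize=False` silently forces `return_dict=False` instead of raising. A minimal sketch of the resulting call patterns, assuming any chat-capable checkpoint (the model name below is only an illustrative placeholder, not something this diff touches):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint only; any tokenizer with a chat template behaves the same.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

chat = [
    {"role": "system", "content": "You are a helpful chatbot."},
    {"role": "user", "content": "Hello!"},
]

# New default: tokenize=True + return_dict=True, so a BatchEncoding comes back.
encoded = tokenizer.apply_chat_template(chat, add_generation_prompt=True)
print(encoded["input_ids"])  # token ids, alongside attention_mask etc.

# The old behavior (a plain list of token ids) is still available by opting out.
ids = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=False)

# tokenize=False still returns the formatted string; return_dict is now
# ignored here rather than raising a ValueError as before.
text = tokenizer.apply_chat_template(chat, tokenize=False)
```

This is why the test changes above either append `.input_ids` to tokenized calls or pass `return_dict=False` explicitly, and why `encode_message_with_chat_template` now forwards `return_dict=False` so it keeps returning bare token lists.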