Skip to content
Merged
23 changes: 12 additions & 11 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3192,7 +3192,7 @@ def apply_chat_template(
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_dict: bool = False,
return_dict: bool = True,
return_assistant_tokens_mask: bool = False,
tokenizer_kwargs: Optional[dict[str, Any]] = None,
**kwargs,
Expand Down Expand Up @@ -3265,14 +3265,11 @@ def apply_chat_template(
set, will return a dict of tokenizer outputs instead.
"""

if return_dict and not tokenize:
raise ValueError(
"`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
"of tokenizer outputs to return."
)
if not tokenize:
return_dict = False # dicts are only returned by the tokenizer anyway
Comment on lines +3268 to +3269
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes me wonder: do we need to support the combination of tokenize=True, return_dict=False, or can we deprecate/remove return_dict over time? I can't think of cases where users would want a bare list of tokens as output.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can get rid of it over time, but I think it's fine as a backward compatibility flag for now!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure — I meant after v5 plus several more minor releases, and only if users are fine with it.


if return_assistant_tokens_mask and not return_dict:
raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`")
if return_assistant_tokens_mask and not (return_dict and tokenize):
raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`")

if tokenizer_kwargs is None:
tokenizer_kwargs = {}
Expand Down Expand Up @@ -3387,13 +3384,17 @@ def encode_message_with_chat_template(
)

if conversation_history is None or len(conversation_history) == 0:
return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs)
return self.apply_chat_template(
[message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
)

conversation = conversation_history + [message]
tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs)
tokens = self.apply_chat_template(
conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
)

prefix_tokens = self.apply_chat_template(
conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
)
# It's possible that the prefix tokens are not a prefix of the full list of tokens.
# For example, if the prefix is `<s>User: Hi` and the full conversation is `<s>User: Hi</s><s>Assistant: Hello`.
Expand Down
20 changes: 0 additions & 20 deletions tests/models/blenderbot/test_tokenization_blenderbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,3 @@ def test_pretokenized_inputs(self, *args, **kwargs):
# The issue is that when you have a sequence with leading spaces, splitting it
# with .split() loses the leading spaces, so the tokenization results differ
pass

def test_tokenization_for_chat(self):
tok = self.get_tokenizer()
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
{"role": "system", "content": "You are a helpful chatbot."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Nice to meet you."},
],
[{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
]
tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats]
expected_tokens = [
[553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2],
[553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2],
[3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2],
]
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)
24 changes: 1 addition & 23 deletions tests/models/bloom/test_tokenization_bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from datasets import load_dataset

from transformers import TokenizersBackend
from transformers.testing_utils import require_jinja, require_tokenizers, slow
from transformers.testing_utils import require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin

Expand Down Expand Up @@ -129,28 +129,6 @@ def test_encodings_from_xnli_dataset(self):
predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens]
self.assertListEqual(predicted_text, input_text)

@require_jinja
def test_tokenization_for_chat(self):
tokenizer = self.get_tokenizer()
tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
{"role": "system", "content": "You are a helpful chatbot."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Nice to meet you."},
],
[{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
]
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
expected_tokens = [
[5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2],
[5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2],
[229126, 427, 11890, 1152, 17, 2, 59414, 4, 2],
]
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)

def test_add_prefix_space_fast(self):
tokenizer_w_prefix = self.get_tokenizer(add_prefix_space=True)
tokenizer_wo_prefix = self.get_tokenizer(add_prefix_space=False)
Expand Down
26 changes: 0 additions & 26 deletions tests/models/cohere/test_tokenization_cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,32 +73,6 @@ def test_pretrained_model_lists(self):
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)

@require_jinja
def test_tokenization_for_chat(self):
tokenizer = self.get_tokenizer()
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
{"role": "system", "content": "You are a helpful chatbot."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Nice to meet you."},
],
]
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
# fmt: off
expected_tokens = [
[5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8],
[5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65,
59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8,
36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59,
45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61,
58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38,
54, 567, 235, 693, 276, 411, 243, 22, 8]
]
# fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)

@require_jinja
def test_tokenization_for_tool_use(self):
tokenizer = self.get_tokenizer()
Expand Down
22 changes: 1 addition & 21 deletions tests/models/gpt2/test_tokenization_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import unittest

from transformers import AutoTokenizer, GPT2Tokenizer
from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
from transformers.testing_utils import require_tiktoken, require_tokenizers

from ...test_tokenization_common import TokenizerTesterMixin

Expand Down Expand Up @@ -67,26 +67,6 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)

@require_jinja
def test_tokenization_for_chat(self):
tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
{"role": "system", "content": "You are a helpful chatbot."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Nice to meet you."},
],
[{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
]
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
# fmt: off
expected_tokens = [[1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256], [1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256, 35284, 284, 1826, 345, 13, 50256], [35284, 284, 1826, 345, 13, 50256, 15496, 0, 50256]]
# fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)

@require_tiktoken
def test_tokenization_tiktoken(self):
from tiktoken import encoding_name_for_model
Expand Down
35 changes: 1 addition & 34 deletions tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import unittest

from transformers import GPTSw3Tokenizer
from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin

Expand Down Expand Up @@ -129,36 +129,3 @@ def test_tokenizer_integration(self):
model_name="AI-Sweden-Models/gpt-sw3-126m",
sequences=sequences,
)

@require_jinja
def test_tokenization_for_chat(self):
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, name_or_path="test")
tokenizer.chat_template = (
"{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
"{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
"{{ message['text'] }}{{ bos_token }}"
"{% endfor %}"
"Bot:"
)
# This is in English, but it's just here to make sure the chat control tokens are being added properly
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
{"role": "system", "content": "You are a helpful chatbot."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Nice to meet you."},
],
[{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
]
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
# fmt: off
expected_tokens = [
[2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419],
[2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419],
[2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419]
]
# fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)
4 changes: 3 additions & 1 deletion tests/test_tokenization_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,7 +950,9 @@ def test_chat_template(self):
dummy_conversation, chat_template=dummy_template, tokenize=True, return_dict=False
)
dict_output = tokenizer.apply_chat_template(
dummy_conversation, chat_template=dummy_template, tokenize=True, return_dict=True
dummy_conversation,
chat_template=dummy_template,
tokenize=True, # This also checks return_dict=True is the default
)
self.assertEqual(dict_output["input_ids"], output) # Test return_dict behaviour matches

Expand Down