2 changes: 1 addition & 1 deletion src/transformers/models/voxtral/processing_voxtral.py
@@ -206,7 +206,7 @@ def apply_chat_template(
tokenizer_kwargs = {**processed_kwargs["template_kwargs"], **text_kwargs}
tokenizer_kwargs["return_tensors"] = None # let's not return tensors here
tokenize = tokenizer_kwargs.pop("tokenize", False)
-return_dict = tokenizer_kwargs.pop("return_dict", False)
+return_dict = tokenizer_kwargs.pop("return_dict", True)

encoded_instruct_inputs = self.tokenizer.apply_chat_template(
conversations,
2 changes: 1 addition & 1 deletion src/transformers/processing_utils.py
@@ -1603,7 +1603,7 @@ def apply_chat_template(
conversations = [conversation]

tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
-return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
+return_dict = processed_kwargs["template_kwargs"].pop("return_dict", True)
mm_load_kwargs = processed_kwargs["mm_load_kwargs"]

if tokenize:
2 changes: 1 addition & 1 deletion src/transformers/tokenization_mistral_common.py
@@ -1378,7 +1378,7 @@ def apply_chat_template(
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
-return_dict: bool = False,
+return_dict: bool = True,
**kwargs,
) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
"""
23 changes: 12 additions & 11 deletions src/transformers/tokenization_utils_base.py
@@ -1588,7 +1588,7 @@ def apply_chat_template(
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
-return_dict: bool = False,
+return_dict: bool = True,
return_assistant_tokens_mask: bool = False,
tokenizer_kwargs: Optional[dict[str, Any]] = None,
**kwargs,
@@ -1661,14 +1661,11 @@ def apply_chat_template(
set, will return a dict of tokenizer outputs instead.
"""

-if return_dict and not tokenize:
-    raise ValueError(
-        "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
-        "of tokenizer outputs to return."
-    )
+if not tokenize:
+    return_dict = False  # dicts are only returned by the tokenizer anyway

-if return_assistant_tokens_mask and not return_dict:
-    raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`")
+if return_assistant_tokens_mask and not (return_dict and tokenize):
+    raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`")

if tokenizer_kwargs is None:
tokenizer_kwargs = {}
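
Taken together, these changes flip the default output of `apply_chat_template` whenever it tokenizes: callers now get a dict of tokenizer outputs (`input_ids`, `attention_mask`, ...) by default instead of a bare list of token ids, and `tokenize=False` now silently disables `return_dict` rather than raising. A minimal usage sketch of the new behavior (the checkpoint name below is only an illustrative assumption):

```python
from transformers import AutoTokenizer

# Any chat-templated checkpoint works here; this one is just an example.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
chat = [{"role": "user", "content": "Hello!"}]

# tokenize=True now returns a dict of tokenizer outputs by default.
encoded = tokenizer.apply_chat_template(chat, tokenize=True)
print(encoded["input_ids"])

# tokenize=False still returns the rendered string; return_dict is
# silently reset to False because only the tokenizer produces a dict.
text = tokenizer.apply_chat_template(chat, tokenize=False)
print(text)

# Callers that want the old flat list of token ids must opt out explicitly.
ids = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=False)

# return_assistant_tokens_mask still requires both tokenize=True and
# return_dict=True; any other combination now raises a ValueError.
```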
@@ -1783,13 +1780,17 @@ def encode_message_with_chat_template(
)

if conversation_history is None or len(conversation_history) == 0:
-return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs)
+return self.apply_chat_template(
+    [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
+)

conversation = conversation_history + [message]
-tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs)
+tokens = self.apply_chat_template(
+    conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
+)

prefix_tokens = self.apply_chat_template(
-    conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
+    conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
)
# It's possible that the prefix tokens are not a prefix of the full list of tokens.
# For example, if the prefix is `<s>User: Hi` and the full conversation is `<s>User: Hi</s><s>Assistant: Hello`.
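
The hunk is truncated here, but the comment above describes the check that follows. A hedged sketch of that prefix-subtraction idea (a hypothetical helper written for illustration, not the verbatim continuation of `encode_message_with_chat_template`):

```python
from typing import Optional


def suffix_after_prefix(prefix_tokens: list[int], tokens: list[int]) -> Optional[list[int]]:
    """Return the new message's tokens, i.e. everything in `tokens` after
    `prefix_tokens`, or None when the history is not a clean prefix of the
    full conversation (e.g. the template closed the last history message
    with an EOS that is absent when the history is rendered alone)."""
    if tokens[: len(prefix_tokens)] == prefix_tokens:
        return tokens[len(prefix_tokens) :]
    return None
```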
22 changes: 0 additions & 22 deletions tests/models/blenderbot/test_tokenization_blenderbot.py
@@ -18,7 +18,6 @@
from functools import cached_property

from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
-from transformers.testing_utils import require_jinja


class Blenderbot3BTokenizerTests(unittest.TestCase):
@@ -51,24 +50,3 @@ def test_3B_tokenization_same_as_parlai(self):
def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
assert self.rust_tokenizer_3b.add_prefix_space
assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]

-@require_jinja
-def test_tokenization_for_chat(self):

[Review comment] Member:
Out of curiosity, why do you remove this test?

[Reply] Member Author:
It's extremely old - it's not related to this PR really, but these tests come from before chat templates, and we just patched them to support chat templates after chat templates were added. They only exist for a few models, and I think we don't want to keep them, because it's not clear what they test that the main chat template tests don't.

-    tok = self.tokenizer_3b
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-    ]
-    tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats]
-    expected_tokens = [
-        [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2],
-        [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2],
-        [3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2],
-    ]
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)
24 changes: 1 addition & 23 deletions tests/models/bloom/test_tokenization_bloom.py
@@ -18,7 +18,7 @@
from datasets import load_dataset

from transformers import BloomTokenizerFast
-from transformers.testing_utils import require_jinja, require_tokenizers
+from transformers.testing_utils import require_tokenizers

from ...test_tokenization_common import TokenizerTesterMixin

@@ -137,28 +137,6 @@ def test_encodings_from_xnli_dataset(self):
predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens]
self.assertListEqual(predicted_text, input_text)

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = self.get_rust_tokenizer()
-    tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-    ]
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    expected_tokens = [
-        [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2],
-        [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2],
-        [229126, 427, 11890, 1152, 17, 2, 59414, 4, 2],
-    ]
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)

def test_add_prefix_space_fast(self):
tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
26 changes: 0 additions & 26 deletions tests/models/cohere/test_tokenization_cohere.py
Expand Up @@ -146,32 +146,6 @@ def test_pretrained_model_lists(self):
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = self.get_rust_tokenizer()
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-    ]
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    # fmt: off
-    expected_tokens = [
-        [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8],
-        [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65,
-         59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8,
-         36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59,
-         45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61,
-         58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38,
-         54, 567, 235, 693, 276, 411, 243, 22, 8]
-    ]
-    # fmt: on
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)

@require_jinja
def test_tokenization_for_tool_use(self):
tokenizer = self.get_rust_tokenizer()
20 changes: 0 additions & 20 deletions tests/models/gemma/test_tokenization_gemma.py
@@ -27,7 +27,6 @@
from transformers.testing_utils import (
get_tests_dir,
nested_simplify,
-require_jinja,
require_read_token,
require_sentencepiece,
require_tokenizers,
@@ -428,25 +427,6 @@ def test_some_edge_cases(self):
# a dummy prefix space is not added by the sp_model as it was de-activated
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
-
-    test_chats = [
-        [{"role": "user", "content": "Hello!"}],
-        [
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "user", "content": "Hello!"}],
-    ]
-    # Matt: The third test case tests the default system message, but if this is ever changed in the
-    # class/repo code then that test will fail, and the case will need to be updated.
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    expected_tokens = [[235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108, 235322, 235371, 571, 235298, 2997, 73786, 105776, 108, 7731, 577, 4664, 692, 35606, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108]]  # fmt: skip
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)

def test_save_fast_load_slow(self):
# Ensure that we can save a fast tokenizer and load it as a slow tokenizer
slow_tokenizer = self.tokenizer
24 changes: 1 addition & 23 deletions tests/models/gpt2/test_tokenization_gpt2.py
@@ -19,7 +19,7 @@

from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
+from transformers.testing_utils import require_tiktoken, require_tokenizers

from ...test_tokenization_common import TokenizerTesterMixin

@@ -281,28 +281,6 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
-    tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-    ]
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    # fmt: off
-    expected_tokens = [[20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20],
-                       [20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20, 20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20],
-                       [20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20, 20, 3, 0, 0, 1, 20, 20]]
-    # fmt: on
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)

@require_tiktoken
def test_tokenization_tiktoken(self):
from tiktoken import encoding_name_for_model
35 changes: 1 addition & 34 deletions tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -15,7 +15,7 @@
import unittest

from transformers import GPTSw3Tokenizer
-from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin

@@ -127,36 +127,3 @@ def test_tokenizer_integration(self):
model_name="AI-Sweden-Models/gpt-sw3-126m",
sequences=sequences,
)

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
-    tokenizer.chat_template = (
-        "{{ eos_token }}{{ bos_token }}"
-        "{% for message in messages %}"
-        "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
-        "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
-        "{{ message['text'] }}{{ bos_token }}"
-        "{% endfor %}"
-        "Bot:"
-    )
-    # This is in English, but it's just here to make sure the chat control tokens are being added properly
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-    ]
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    # fmt: off
-    expected_tokens = [
-        [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419],
-        [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419],
-        [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419]
-    ]
-    # fmt: on
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)
27 changes: 0 additions & 27 deletions tests/models/llama/test_tokenization_llama.py
@@ -32,7 +32,6 @@
from transformers.testing_utils import (
get_tests_dir,
nested_simplify,
-require_jinja,
require_read_token,
require_sentencepiece,
require_tiktoken,
@@ -702,32 +701,6 @@ def test_fast_post_processor(self):
with self.assertRaises(ValueError):
tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True)

-@require_jinja
-def test_tokenization_for_chat(self):
-    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
-
-    test_chats = [
-        [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-        [
-            {"role": "system", "content": "You are a helpful chatbot."},
-            {"role": "user", "content": "Hello!"},
-            {"role": "assistant", "content": "Nice to meet you."},
-        ],
-        [{"role": "user", "content": "Hello!"}],
-    ]
-    # Matt: The third test case tests the default system message, but if this is ever changed in the
-    # class/repo code then that test will fail, and the case will need to be updated.
-    tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-    # fmt: off
-    expected_tokens = [
-        [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962],
-        [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2],
-        [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962]
-    ]
-    # fmt: on
-    for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-        self.assertListEqual(tokenized_chat, expected_tokens)


@require_sentencepiece
@require_tokenizers