Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions src/transformers/tokenization_utils_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,15 +425,12 @@ def update_post_processor(self):
bos = self.bos_token
bos_token_id = self.bos_token_id
if bos is None and self.add_bos_token:
raise ValueError("add_bos_token = True but bos_token = None")
self.add_bos_token = False

eos = self.eos_token
eos_token_id = self.eos_token_id
# If eos_token is None and add_eos_token is True, silently disable add_eos_token
# This allows tokenizers to set add_eos_token even if eos_token is not configured
if eos is None and self.add_eos_token:
self.add_eos_token = False
return

single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
Expand Down
24 changes: 24 additions & 0 deletions tests/test_tokenizers_backend_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,30 @@ def test_rust_tokenizer_add_prefix_space(self, add_prefix_space):
if hasattr(fast_tokenizer.backend_tokenizer.pre_tokenizer, "add_prefix_space"):
self.assertEqual(fast_tokenizer.backend_tokenizer.pre_tokenizer.add_prefix_space, add_prefix_space)

def test_add_bos_token_without_bos_token(self):
    """
    Verify that enabling ``add_bos_token`` on a tokenizer whose ``bos_token``
    is None is silently reverted (the flag reads back False) instead of
    raising, and that the tokenizer remains usable for encode/decode.
    """
    base_tokenizer = self.get_rust_tokenizer()

    # Round-trip through disk so the tokenizer can be reloaded with
    # bos_token explicitly overridden to None.
    with tempfile.TemporaryDirectory() as save_dir:
        base_tokenizer.save_pretrained(save_dir)
        loader_cls = getattr(self, "rust_tokenizer_class", None) or getattr(self, "tokenizer_class", None)
        reloaded = loader_cls.from_pretrained(save_dir, bos_token=None)

    self.assertIsNone(reloaded.bos_token)

    # Turning the flag on without a bos_token must be silently undone.
    reloaded.add_bos_token = True
    self.assertFalse(reloaded.add_bos_token)

    # The tokenizer must still encode and decode cleanly afterwards.
    sample = "Hello world"
    encoding = reloaded(sample)
    self.assertIsNotNone(encoding["input_ids"])
    round_trip = reloaded.decode(encoding["input_ids"], skip_special_tokens=True)
    self.assertIsInstance(round_trip, str)

def test_local_files_only(self):
from transformers import AutoTokenizer

Expand Down