Merged
Commits
25 commits
74da1b4
_decode signature change and quick return
DuyguA Aug 9, 2024
6f4c1e6
added bunch of decoding tests
DuyguA Aug 9, 2024
7ddc3ca
signature match and return
DuyguA Aug 9, 2024
15c2d9e
added tests for decoding
DuyguA Aug 9, 2024
3716cfd
merged decoding test
DuyguA Aug 28, 2024
251a5ac
more tests for special tokens
DuyguA Aug 28, 2024
2e82e67
cosmetics
DuyguA Aug 28, 2024
97d5cb1
fixed param
DuyguA Aug 28, 2024
f5da92b
ruffed the file
DuyguA Aug 28, 2024
2b097eb
refinement for single special tokens
DuyguA Sep 9, 2024
420993c
added test for single special tokens
DuyguA Sep 9, 2024
689b93e
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 9, 2024
8a29adf
slight change to test name
DuyguA Sep 10, 2024
0d858cd
minor change test name for skip tokens
DuyguA Sep 10, 2024
7ed431e
killed already defined var
DuyguA Sep 10, 2024
0c812cc
minor update with vars
DuyguA Sep 10, 2024
29c5950
killed already defined var once more
DuyguA Sep 10, 2024
edca13e
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 10, 2024
a0227b1
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 12, 2024
7a2a9c1
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 12, 2024
7373ab7
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 13, 2024
ffbfd05
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 13, 2024
2362fe6
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 14, 2024
eefe4b2
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 16, 2024
1051d1c
Merge branch 'huggingface:main' into fix/tokenizer-decoding-space
DuyguA Sep 17, 2024
8 changes: 6 additions & 2 deletions src/transformers/tokenization_utils.py
@@ -1070,7 +1070,7 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:

def _decode(
self,
token_ids: List[int],
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = True,
@@ -1079,6 +1079,10 @@ def _decode(
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
# If given a single token ID, prevent splitting the string in the upcoming loop
if isinstance(filtered_tokens, str):
filtered_tokens = [filtered_tokens]

legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
}
@@ -1089,7 +1093,7 @@ def _decode(
current_sub_text = []
# TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
if skip_special_tokens and token in self.all_special_tokens:
continue
if token in legacy_added_tokens:
if current_sub_text:
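
The hunk above makes two changes: _decode now accepts a bare int as well as a List[int], and the skip check compares token strings against self.all_special_tokens instead of the integer self.all_special_ids. A minimal sketch of why that second comparison matters, using illustrative BERT-style special tokens (values assumed, not taken from the diff):

# filtered_tokens holds token *strings*, so membership must be tested
# against token strings, not integer IDs (illustrative values only)
all_special_ids = [0, 100, 101, 102, 103]
all_special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
token = "[PAD]"
print(token in all_special_ids)     # False -- a str never equals an int
print(token in all_special_tokens)  # True  -- the corrected check
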
68 changes: 68 additions & 0 deletions tests/tokenization/test_tokenization_utils.py
@@ -245,6 +245,74 @@ def test_padding_accepts_tensors(self):
self.assertTrue(isinstance(batch["input_ids"], np.ndarray))
self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])

@require_tokenizers
def test_decoding(self):
for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
with self.subTest(f"{tokenizer_class}"):
tokenizer = tokenizer_class.from_pretrained("google-bert/bert-base-cased")

token_id = 2300
decoded_flat = tokenizer.decode(token_id)
decoded_list = tokenizer.decode([token_id])

self.assertEqual(decoded_flat, "Force")
self.assertEqual(decoded_list, "Force")

token_id = 0
decoded_flat = tokenizer.decode(token_id)
decoded_list = tokenizer.decode([token_id])

self.assertEqual(decoded_flat, "[PAD]")
self.assertEqual(decoded_list, "[PAD]")

last_item_id = tokenizer.vocab_size - 1
decoded_flat = tokenizer.decode(last_item_id)
decoded_list = tokenizer.decode([last_item_id])

self.assertEqual(decoded_flat, "##:")
self.assertEqual(decoded_list, "##:")

@require_tokenizers
def test_decoding_extra_params(self):
for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
with self.subTest(f"{tokenizer_class}"):
tokenizer = tokenizer_class.from_pretrained("google-bert/bert-base-cased")
tokenizer.add_tokens(["ஐ"], special_tokens=True)

# test special token with other tokens, skip the special tokens
sentence = "This is a beautiful flower ஐ"
ids = tokenizer(sentence)["input_ids"]
decoded_sent = tokenizer.decode(ids, skip_special_tokens=True)
self.assertEqual(decoded_sent, "This is a beautiful flower")

# test special token with other tokens, do not skip the special tokens
sentence = "This is a beautiful flower ஐ"
ids = tokenizer(sentence)["input_ids"]
decoded_sent = tokenizer.decode(ids, skip_special_tokens=False)
self.assertEqual(decoded_sent, "[CLS] This is a beautiful flower ஐ [SEP]")

# test special token stand alone, skip the special tokens
sentence = "ஐ"
ids = tokenizer(sentence)["input_ids"]
decoded_sent = tokenizer.decode(ids, skip_special_tokens=True)
self.assertEqual(decoded_sent, "")

# test special token stand alone, do not skip the special tokens
sentence = "ஐ"
ids = tokenizer(sentence)["input_ids"]
decoded_sent = tokenizer.decode(ids, skip_special_tokens=False)
self.assertEqual(decoded_sent, "[CLS] ஐ [SEP]")

# test single special token alone, skip
pad_id = 0
decoded_sent = tokenizer.decode(pad_id, skip_special_tokens=True)
self.assertEqual(decoded_sent, "")

# test single special token alone, do not skip
pad_id = 0
decoded_sent = tokenizer.decode(pad_id, skip_special_tokens=False)
self.assertEqual(decoded_sent, "[PAD]")

@require_torch
def test_padding_accepts_tensors_pt(self):
import torch
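
For reference, a short usage sketch of the behavior exercised by the new tests above (the checkpoint name and expected strings mirror the test assertions; this snippet is illustrative and not part of the diff):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer.decode(2300)                           # "Force", same as tokenizer.decode([2300])
tokenizer.decode(0, skip_special_tokens=True)    # "" -- the [PAD] token is skipped
tokenizer.decode(0, skip_special_tokens=False)   # "[PAD]"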