Merged
src/transformers/tokenization_utils.py (14 changes: 7 additions & 7 deletions)
@@ -291,7 +291,7 @@ def split_on_token(tok, text):
                             full_word += sub_text + tok
                         elif full_word:
                             full_word += sub_text
-                            result += [full_word]
+                            result.append(full_word)
                             full_word = ""
                             continue
                     # Strip white spaces on the right
@@ -310,16 +310,16 @@ def split_on_token(tok, text):
                         sub_text = sub_text.lstrip()
 
                 if i == 0 and not sub_text:
-                    result += [tok]
+                    result.append(tok)
                 elif i == len(split_text) - 1:
                     if sub_text:
-                        result += [sub_text]
+                        result.append(sub_text)
                     else:
                         pass
                 else:
                     if sub_text:
-                        result += [sub_text]
-                    result += [tok]
+                        result.append(sub_text)
+                    result.append(tok)
             return result
 
         def split_on_tokens(tok_list, text):
@@ -334,9 +334,9 @@ def split_on_tokens(tok_list, text):
                 tokenized_text = []
                 for sub_text in text_list:
                     if sub_text not in self.unique_no_split_tokens:
-                        tokenized_text += split_on_token(tok, sub_text)
+                        tokenized_text.extend(split_on_token(tok, sub_text))
                     else:
-                        tokenized_text += [sub_text]
+                        tokenized_text.append(sub_text)
                 text_list = tokenized_text
 
             return list(
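For readers skimming the diff: every hunk makes the same substitution, swapping `result += [x]` for `result.append(x)` and `tokenized_text += seq` for `tokenized_text.extend(seq)`. The sketch below (illustrative only, not part of the PR) shows that the two spellings are behaviorally equivalent for lists, and why `append`/`extend` are generally preferred: `+= [x]` allocates a throwaway one-element list on every call before extending in place.

# Illustrative sketch, not part of this PR: equivalence of the two
# spellings, plus a rough timing of the temporary-list overhead.
import timeit

a, b = [], []
a += ["tok"]          # builds a temporary list ["tok"], then extends a in place
b.append("tok")       # adds the element directly, no temporary list
assert a == b

c, d = [], []
c += ["x", "y"]       # list.__iadd__ extends in place from the iterable
d.extend(["x", "y"])
assert c == d

# append avoids the per-call list allocation, so it is typically a bit
# faster; exact numbers vary by interpreter and machine.
print(timeit.timeit("r += ['tok']", setup="r = []", number=1_000_000))
print(timeit.timeit("r.append('tok')", setup="r = []", number=1_000_000))

Readability is the other motivation: `append` and `extend` state the intent directly (add one element vs. splice in a sequence), which `+=` leaves implicit.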