From 7627dde1f8888faf5b05a8f1dbbc2271096cc1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=B0brahim=20Ethem=20Demirci?= Date: Thu, 14 Nov 2019 17:06:15 +0300 Subject: [PATCH] sum() is not the leanest method to flatten a string list, so it's been replaced by itertools.chain.from_iterable() --- transformers/tokenization_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index cd14cc4582db..1eebae08a75f 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -21,6 +21,7 @@ import json import six import copy +import itertools from io import open from .file_utils import cached_path, is_tf_available, is_torch_available @@ -641,9 +642,9 @@ def split_on_tokens(tok_list, text): tokenized_text += [sub_text] text_list = tokenized_text - return sum((self._tokenize(token, **kwargs) if token not \ in self.added_tokens_encoder and token not in self.all_special_tokens \ else [token] for token in tokenized_text), []) + return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \ in self.added_tokens_encoder and token not in self.all_special_tokens \ else [token] for token in tokenized_text))) added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens tokenized_text = split_on_tokens(added_tokens, text)