Skip to content

Commit

Permalink
save extra special tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
DesmonDay committed Feb 11, 2025
1 parent 3967f76 commit 2f9774d
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions paddlenlp/transformers/tokenizer_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,11 @@ def add_tokens(

return self._add_tokens(new_tokens, special_tokens=special_tokens)

@classmethod
def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedToken]):
    """Register ``extra_sp_token`` in ``cls.SPECIAL_TOKENS_ATTRIBUTES`` once.

    Args:
        extra_sp_token: Entry to record. It is appended only when it is not
            already present, so repeated registrations are no-ops.

    NOTE(review): the entry is appended in place to the list object reached
    through ``cls``. If that list is defined on a shared base class, the new
    entry presumably becomes visible to every class using it -- confirm
    that this is intended.
    """
    registered = cls.SPECIAL_TOKENS_ATTRIBUTES
    if extra_sp_token in registered:
        # Already known; keep the list free of duplicates.
        return
    registered.append(extra_sp_token)

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """Placeholder for the token-adding implementation; always raises here.

    Args:
        new_tokens: Tokens to add, given as strings or ``AddedToken`` objects.
        special_tokens: Flag forwarded by the caller; unused in this base
            version.

    Returns:
        Annotated as ``int``, but this base version never returns.
        (Presumably concrete tokenizers override this method -- confirm
        against the subclasses.)

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

Expand Down Expand Up @@ -1213,7 +1218,13 @@ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, "_" + attr)
try:
attr_value = getattr(self, "_" + attr)
except:
try:
attr_value = getattr(self, attr)
except:
continue
if attr_value:
set_attr[attr] = (
type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value)
Expand All @@ -1233,7 +1244,13 @@ def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[U
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, "_" + attr)
try:
attr_value = getattr(self, "_" + attr)
except:
try:
attr_value = getattr(self, attr)
except:
continue
if attr_value:
set_attr[attr] = attr_value
return set_attr
Expand Down Expand Up @@ -1744,6 +1761,7 @@ def convert_added_tokens(obj):
elif isinstance(value, list):
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
setattr(tokenizer, key, value)
cls._add_extra_special_tokens(key)

# Add supplementary tokens.
special_tokens = tokenizer.all_special_tokens
Expand Down

0 comments on commit 2f9774d

Please sign in to comment.