Skip to content

Commit

Permalink
Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface.
Browse files Browse the repository at this point in the history
Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer.

Signed-off-by: Valerie Sarge <[email protected]>
  • Loading branch information
vysarge committed Feb 9, 2024
1 parent 89e76f1 commit 5b23334
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,6 @@ def __init__(
if token is not None and token not in self.tokenizer.get_vocab():
new_tokens_in_vocab.append(token)

# value is required for megatron-core
self.unique_identifiers = OrderedDict()

if len(new_tokens_in_vocab) > 0:
"""
Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
Expand Down Expand Up @@ -231,11 +228,6 @@ def bos_id(self):
def eos_id(self):
    """Return the id that ``tokens_to_ids`` assigns to this tokenizer's ``eos_token``."""
    # tokens_to_ids is given a one-element list, so the first entry is the id we want.
    eos_ids = self.tokens_to_ids([self.eos_token])
    return eos_ids[0]

@property
def eod(self):
    """Return the EOS token id; performs the same lookup as ``eos_id``. Required for megatron-core."""
    end_token = self.eos_token
    return self.tokens_to_ids([end_token])[0]

@property
def sep_id(self):
    """Return the id that ``tokens_to_ids`` assigns to this tokenizer's ``sep_token``."""
    # tokens_to_ids is given a one-element list, so the first entry is the id we want.
    sep_ids = self.tokens_to_ids([self.sep_token])
    return sep_ids[0]
Expand Down
4 changes: 0 additions & 4 deletions nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,6 @@ def eos_id(self):
eos_id = self.tokenizer.eos_id()
return eos_id

@property
def eod(self):
    """Return the end-of-document id, which is this tokenizer's ``eos_id``."""
    end_of_document = self.eos_id
    return end_of_document

@property
def sep_id(self):
if self.legacy:
Expand Down
52 changes: 52 additions & 0 deletions nemo/collections/common/tokenizers/tokenizer_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,55 @@ def name(self):
def unique_identifiers(self):
    """Property required for use with megatron-core datasets.

    Returns an ``OrderedDict`` with a single ``"class"`` entry holding the
    fully qualified name (module plus qualname) of the concrete tokenizer type.
    """
    tokenizer_type = type(self)
    qualified_name = f"{tokenizer_type.__module__}.{tokenizer_type.__qualname__}"
    return OrderedDict([("class", qualified_name)])

@property
def cls(self):
    """Property alias to match MegatronTokenizer; returns ``cls_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``cls_id``.
    """
    # Read cls_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed cls_id twice.
    try:
        return self.cls_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'") from None

@property
def sep(self):
    """Property alias to match MegatronTokenizer; returns ``sep_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``sep_id``.
    """
    # Read sep_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed sep_id twice.
    try:
        return self.sep_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'") from None

@property
def pad(self):
    """Property alias to match MegatronTokenizer; returns ``pad_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``pad_id``.
    """
    # Read pad_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed pad_id twice.
    try:
        return self.pad_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'") from None

@property
def eod(self):
    """Property alias to match MegatronTokenizer; returns the end-of-document id.

    Returns ``eod_id`` when the tokenizer provides it; otherwise defaults to
    the end-of-sentence id, ``eos_id``.

    Raises:
        AttributeError: If neither ``eod_id`` nor ``eos_id`` is available.
    """
    # Try each candidate once, in priority order (EAFP). A hasattr() probe
    # followed by a second access evaluates a property-backed id twice.
    for attr_name in ('eod_id', 'eos_id'):
        try:
            return getattr(self, attr_name)
        except AttributeError:
            continue
    raise AttributeError(f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'")

@property
def bos(self):
    """Property alias to match MegatronTokenizer; returns ``bos_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``bos_id``.
    """
    # Read bos_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed bos_id twice.
    try:
        return self.bos_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'") from None

@property
def eos(self):
    """Property alias to match MegatronTokenizer; returns ``eos_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``eos_id``.
    """
    # Read eos_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed eos_id twice.
    try:
        return self.eos_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'") from None

@property
def mask(self):
    """Property alias to match MegatronTokenizer; returns ``mask_id``.

    Raises:
        AttributeError: If this tokenizer does not provide ``mask_id``.
    """
    # Read mask_id exactly once (EAFP). A hasattr() probe followed by a second
    # access evaluates a property-backed mask_id twice.
    try:
        return self.mask_id
    except AttributeError:
        raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'") from None

0 comments on commit 5b23334

Please sign in to comment.