Skip to content

Commit

Permalink
MCore dataset compatibility for tokenizers (#8390) (#8397)
Browse files Browse the repository at this point in the history
* Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer

* Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer.

---------

Signed-off-by: Valerie Sarge <[email protected]>
Co-authored-by: Valerie Sarge <[email protected]>
Co-authored-by: Pablo Garay <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
Signed-off-by: Alexandros Koumparoulis <[email protected]>
  • Loading branch information
4 people authored and akoumpa committed Feb 19, 2024
1 parent 4829965 commit ebef7c6
Showing 1 changed file with 58 additions and 0 deletions.
58 changes: 58 additions & 0 deletions nemo/collections/common/tokenizers/tokenizer_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import List

__all__ = ['TokenizerSpec']
Expand Down Expand Up @@ -53,3 +54,60 @@ def add_special_tokens(self, special_tokens: List[str]):
@property
def name(self):
    """Return the class name of this tokenizer instance."""
    klass = type(self)
    return klass.__name__

@property
def unique_identifiers(self):
    """Property required for use with megatron-core datasets.

    Returns:
        An ``OrderedDict`` with a single ``"class"`` entry holding the
        instance's class as ``<module>.<qualname>``.
    """
    identifiers = OrderedDict()
    identifiers["class"] = f"{type(self).__module__}.{type(self).__qualname__}"
    return identifiers

@property
def cls(self):
    """Property alias to match MegatronTokenizer; returns cls_id if available.

    Returns:
        The value of ``self.cls_id``.

    Raises:
        AttributeError: If the instance has no ``cls_id`` attribute.
    """
    # Resolve the attribute exactly once: ``cls_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'cls_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'")
    return token_id

@property
def sep(self):
    """Property alias to match MegatronTokenizer; returns sep_id if available.

    Returns:
        The value of ``self.sep_id``.

    Raises:
        AttributeError: If the instance has no ``sep_id`` attribute.
    """
    # Resolve the attribute exactly once: ``sep_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'sep_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'")
    return token_id

@property
def pad(self):
    """Property alias to match MegatronTokenizer; returns pad_id if available.

    Returns:
        The value of ``self.pad_id``.

    Raises:
        AttributeError: If the instance has no ``pad_id`` attribute.
    """
    # Resolve the attribute exactly once: ``pad_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'pad_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'")
    return token_id

@property
def eod(self):
    """Property alias to match MegatronTokenizer; returns eod_id if available.

    Falls back to ``eos_id`` (end-of-sentence) when no end-of-document id
    is defined on the instance.

    Returns:
        The value of ``self.eod_id`` if present, otherwise ``self.eos_id``.

    Raises:
        AttributeError: If the instance has neither ``eod_id`` nor ``eos_id``.
    """
    # Resolve each candidate exactly once: the ids may be computed
    # properties, and hasattr() followed by a second access evaluates twice.
    missing = object()
    # Order matters: prefer end-of-document, then default to end-of-sentence.
    for attr in ('eod_id', 'eos_id'):
        token_id = getattr(self, attr, missing)
        if token_id is not missing:
            return token_id
    raise AttributeError(f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'")

@property
def bos(self):
    """Property alias to match MegatronTokenizer; returns bos_id if available.

    Returns:
        The value of ``self.bos_id``.

    Raises:
        AttributeError: If the instance has no ``bos_id`` attribute.
    """
    # Resolve the attribute exactly once: ``bos_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'bos_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'")
    return token_id

@property
def eos(self):
    """Property alias to match MegatronTokenizer; returns eos_id if available.

    Returns:
        The value of ``self.eos_id``.

    Raises:
        AttributeError: If the instance has no ``eos_id`` attribute.
    """
    # Resolve the attribute exactly once: ``eos_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'eos_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'")
    return token_id

@property
def mask(self):
    """Property alias to match MegatronTokenizer; returns mask_id if available.

    Returns:
        The value of ``self.mask_id``.

    Raises:
        AttributeError: If the instance has no ``mask_id`` attribute.
    """
    # Resolve the attribute exactly once: ``mask_id`` may be a computed
    # property, and hasattr() followed by a second access evaluates it twice.
    missing = object()
    token_id = getattr(self, 'mask_id', missing)
    if token_id is missing:
        raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'")
    return token_id

0 comments on commit ebef7c6

Please sign in to comment.