diff --git a/nemo/collections/common/tokenizers/tokenizer_spec.py b/nemo/collections/common/tokenizers/tokenizer_spec.py
index 252571d76ef2..f6e905d75c3b 100644
--- a/nemo/collections/common/tokenizers/tokenizer_spec.py
+++ b/nemo/collections/common/tokenizers/tokenizer_spec.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
+from collections import OrderedDict
 from typing import List
 
 __all__ = ['TokenizerSpec']
@@ -53,3 +54,105 @@ def add_special_tokens(self, special_tokens: List[str]):
     @property
     def name(self):
         return type(self).__name__
+
+    @property
+    def unique_identifiers(self):
+        """Property required for use with megatron-core datasets.
+
+        Returns:
+            OrderedDict: A single ``"class"`` entry holding this tokenizer's module path
+            and qualified class name joined by a dot.
+        """
+        return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"})
+
+    @property
+    def cls(self):
+        """Property alias to match MegatronTokenizer; returns cls_id if available.
+
+        Raises:
+            AttributeError: If ``cls_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.cls_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'") from err
+
+    @property
+    def sep(self):
+        """Property alias to match MegatronTokenizer; returns sep_id if available.
+
+        Raises:
+            AttributeError: If ``sep_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.sep_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'") from err
+
+    @property
+    def pad(self):
+        """Property alias to match MegatronTokenizer; returns pad_id if available.
+
+        Raises:
+            AttributeError: If ``pad_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.pad_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'") from err
+
+    @property
+    def eod(self):
+        """Property alias to match MegatronTokenizer; returns eod_id, falling back to eos_id.
+
+        Raises:
+            AttributeError: If neither ``eod_id`` nor ``eos_id`` is available; the ``eos_id``
+                lookup error is chained as its cause.
+        """
+        try:
+            return self.eod_id
+        except AttributeError:
+            pass
+        try:
+            # Default to end-of-sentence id if end-of-document is not defined.
+            return self.eos_id
+        except AttributeError as err:
+            raise AttributeError(
+                f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'"
+            ) from err
+
+    @property
+    def bos(self):
+        """Property alias to match MegatronTokenizer; returns bos_id if available.
+
+        Raises:
+            AttributeError: If ``bos_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.bos_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'") from err
+
+    @property
+    def eos(self):
+        """Property alias to match MegatronTokenizer; returns eos_id if available.
+
+        Raises:
+            AttributeError: If ``eos_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.eos_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'") from err
+
+    @property
+    def mask(self):
+        """Property alias to match MegatronTokenizer; returns mask_id if available.
+
+        Raises:
+            AttributeError: If ``mask_id`` is unavailable; the lookup error is chained as its cause.
+        """
+        try:
+            return self.mask_id
+        except AttributeError as err:
+            raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'") from err