diff --git a/setup.py b/setup.py index 0170b82f78..8d4366af2d 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,8 @@ def find_version(*file_paths): 'recommonmark', 'sphinx-gallery', 'sphinx_rtd_theme', + 'mxtheme', + 'sphinx-autodoc-typehints', 'nbsphinx', 'flaky', ], diff --git a/src/gluonnlp/__init__.py b/src/gluonnlp/__init__.py index 9d81c73607..7faf51d690 100644 --- a/src/gluonnlp/__init__.py +++ b/src/gluonnlp/__init__.py @@ -18,6 +18,8 @@ # pylint: disable=wildcard-import """NLP toolkit.""" +import warnings + from . import loss from . import data from . import embedding @@ -41,3 +43,5 @@ 'optimizer', 'utils', 'metric'] + +warnings.filterwarnings(module='gluonnlp', action='default', category=DeprecationWarning) diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py index 5943b90d6b..7138fca370 100644 --- a/src/gluonnlp/vocab/vocab.py +++ b/src/gluonnlp/vocab/vocab.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=consider-iterating-dictionary - """Vocabulary.""" __all__ = ['Vocab'] @@ -33,6 +32,9 @@ from ..data.utils import Counter, DefaultLookupDict, count_tokens UNK_IDX = 0 +_DEPR_PAD = object() +_DEPR_BOS = object() +_DEPR_EOS = object() class Vocab: @@ -60,12 +62,6 @@ class Vocab: `None`, looking up any token that is not part of the vocabulary and thus considered unknown will return the index of `unknown_token`. If None, looking up an unknown token will result in `KeyError`. - padding_token - The representation for the special token of padding token. - bos_token - The representation for the special token of beginning-of-sequence token. - eos_token - The representation for the special token of end-of-sequence token. reserved_tokens A list specifying additional tokens to be added to the vocabulary. `reserved_tokens` must not contain the value of `unknown_token` or @@ -79,7 +75,7 @@ class Vocab: example, it is valid to only set the `unknown_token` index to 10 (instead of the default of 0) with `token_to_idx = {'': 10}`, assuming that there are at least 10 tokens in the vocabulary. - **kwargs + `**kwargs` Keyword arguments of the format `xxx_token` can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index. @@ -90,6 +86,21 @@ class Vocab: just as if it was listed in the `reserved_tokens` argument. The specified tokens are listed together with reserved tokens in the `reserved_tokens` attribute of the vocabulary object. + deprecated_padding_token + The representation for the special token of padding token. Default: + ''. Specifying padding_token as positional argument is deprecated + and support will be removed. Specify it as keyword argument instead + (see documentation of `**kwargs` above) + deprecated_bos_token + The representation for the special token of beginning-of-sequence + token. Default: ''. Specifying bos_token as positional argument is + deprecated and support will be removed. Specify it as keyword argument + instead (see documentation of `**kwargs` above) + deprecated_eos_token + The representation for the special token of end-of-sequence token. + Default: ''. Specifying eos_token as positional argument is + deprecated and support will be removed. Specify it as keyword argument + instead (see documentation of `**kwargs` above) Attributes ---------- @@ -173,15 +184,34 @@ class Vocab: def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = None, min_freq: int = 1, unknown_token: Optional[Hashable] = C.UNK_TOKEN, + deprecated_padding_token: Optional[Hashable] = _DEPR_PAD, + deprecated_bos_token: Optional[Hashable] = _DEPR_BOS, + deprecated_eos_token: Optional[Hashable] = _DEPR_EOS, + reserved_tokens: Optional[List[Hashable]] = None, + token_to_idx: Optional[Dict[Hashable, int]] = None, *, padding_token: Optional[Hashable] = C.PAD_TOKEN, bos_token: Optional[Hashable] = C.BOS_TOKEN, - eos_token: Optional[Hashable] = C.EOS_TOKEN, - reserved_tokens: Optional[List[Hashable]] = None, - token_to_idx: Optional[Dict[Hashable, int]] = None, **kwargs): + eos_token: Optional[Hashable] = C.EOS_TOKEN, **kwargs): # Sanity checks. assert min_freq > 0, '`min_freq` must be set to a positive value.' + # Deprecation checks and warnings + combs = ((deprecated_padding_token, 'padding_token', _DEPR_PAD, padding_token), + (deprecated_bos_token, 'bos_token', _DEPR_BOS, bos_token), + (deprecated_eos_token, 'eos_token', _DEPR_EOS, eos_token)) + for depr_pos_arg, name, indicator, value in combs: + if depr_pos_arg != indicator: + warnings.warn( + 'Specifying `{n}` as positional argument is deprecated and ' + 'support will be removed. Please specify `{n}` as keyword argument instead, ' + 'for example `Vocab(counter, {n}={v})`'.format(n=name, v=depr_pos_arg), + DeprecationWarning) + # Store positional argument value in kwargs + kwargs[name] = depr_pos_arg + elif name not in kwargs: # Store keyword argument value in kwargs + kwargs[name] = value + # Set up idx_to_token and token_to_idx based on presence of unknown token self._unknown_token = unknown_token self._idx_to_token = [unknown_token] if unknown_token else [] @@ -190,10 +220,6 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = else: self._token_to_idx = {} - kwargs['padding_token'] = padding_token - kwargs['bos_token'] = bos_token - kwargs['eos_token'] = eos_token - # Handle special tokens special_tokens = [] for special_token_name, special_token in kwargs.items():