Commit aae5ed9

adding special_tokens from tokenizer config for transformer-lm model
azzhipa committed Oct 3, 2023
1 parent 5cb76a5 commit aae5ed9
Showing 1 changed file with 3 additions and 2 deletions. The change threads the tokenizer config's optional special_tokens entry through to setup_tokenizer, which previously hard-coded special_tokens=None.
@@ -59,6 +59,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
             tokenizer_model=cfg.tokenizer.get("tokenizer_model", None),
             vocab_file=cfg.tokenizer.get("vocab_file", None),
             bpe_dropout=cfg.tokenizer.get("bpe_dropout", 0.0),
+            special_tokens=OmegaConf.to_container(cfg.tokenizer.special_tokens) if cfg.tokenizer.get("special_tokens", None) else None,
         )

         # init superclass
@@ -196,7 +197,7 @@ def on_test_epoch_end(self):
         self.test_step_outputs.clear()  # free memory

     def setup_tokenizer(
-        self, tokenizer_name=None, tokenizer_model=None, vocab_file=None, bpe_dropout=0.0,
+        self, tokenizer_name=None, tokenizer_model=None, vocab_file=None, bpe_dropout=0.0, special_tokens=None,
     ):

         supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'word']
@@ -208,7 +209,7 @@ def setup_tokenizer(
             tokenizer_model=self.register_artifact("cfg.tokenizer.tokenizer_model", tokenizer_model),
             vocab_file=vocab_file,
             bpe_dropout=bpe_dropout,
-            special_tokens=None,
+            special_tokens=special_tokens,
             use_fast=False,
         )
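Why the OmegaConf.to_container call matters: values read from an OmegaConf/Hydra config are DictConfig or ListConfig objects, while tokenizer constructors generally expect plain Python containers, so the config node is converted before being handed to setup_tokenizer. Below is a minimal sketch of the same guard-and-convert pattern; the config contents (tokenizer name, bos/eos tokens) are made-up illustrations, not values from this commit.

# Sketch of the guard-and-convert pattern used in the diff above.
# The tokenizer_name and special_tokens values here are illustrative only.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "tokenizer": {
            "tokenizer_name": "sentencepiece",  # hypothetical config entry
            "special_tokens": {"bos_token": "<s>", "eos_token": "</s>"},
        }
    }
)

# Same conditional as in __init__: convert only when the key exists and is
# non-empty; otherwise pass None so setup_tokenizer keeps its default.
special_tokens = (
    OmegaConf.to_container(cfg.tokenizer.special_tokens)
    if cfg.tokenizer.get("special_tokens", None)
    else None
)

print(type(special_tokens))  # <class 'dict'> -- a plain dict, not DictConfig
print(special_tokens)        # {'bos_token': '<s>', 'eos_token': '</s>'}

Passing special_tokens=None when the key is absent preserves the previous behavior: setup_tokenizer defaults the new parameter to None and forwards it where None was hard-coded before.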
