
Commit

Merge branch 'megatron-bart' of github.com:michalivne/NeMo into megatron-bart
michalivne committed Mar 30, 2022
2 parents 40ad596 + 0b08d2a commit 6c6b7fd
Showing 46 changed files with 382 additions and 468 deletions.
56 changes: 28 additions & 28 deletions Jenkinsfile
@@ -2001,34 +2001,34 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/bert_index_mappings"
}
}
stage('L2: Megatron P-Tuning GPT LM') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/text_classification/ptune_text_classification.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.max_epochs=1 \
+trainer.limit_val_batches=10 \
+trainer.limit_train_batches=10 \
+trainer.limit_test_batches=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/ptune_results \
model.tokenizer.vocab_file=/home/TestData/nlp/ptune/gpt2-vocab.json \
model.tensor_model_parallel_size=2 \
model.tokenizer.merge_file=/home/TestData/nlp/ptune/gpt2-merges.txt \
model.language_model.nemo_file=/home/TestData/nlp/ptune/small_gpt.nemo \
model.dataset.classes=[positive,neutral,negative] \
model.train_ds.file_path=/home/TestData/nlp/ptune/data/train_0.txt \
model.validation_ds.file_path=/home/TestData/nlp/ptune/data/validation_0.txt \
model.test_ds.file_path=/home/TestData/nlp/ptune/data/test_0.txt "
sh "rm -rf examples/nlp/language_modeling/ptune_results"
}
}
// stage('L2: Megatron P-Tuning GPT LM') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// steps {
// sh "python examples/nlp/text_classification/ptune_text_classification.py \
// trainer.devices=2 \
// trainer.accelerator=gpu \
// trainer.max_epochs=1 \
// +trainer.limit_val_batches=10 \
// +trainer.limit_train_batches=10 \
// +trainer.limit_test_batches=10 \
// exp_manager.exp_dir=examples/nlp/language_modeling/ptune_results \
// model.tokenizer.vocab_file=/home/TestData/nlp/ptune/gpt2-vocab.json \
// model.tensor_model_parallel_size=2 \
// model.tokenizer.merge_file=/home/TestData/nlp/ptune/gpt2-merges.txt \
// model.language_model.nemo_file=/home/TestData/nlp/ptune/small_gpt.nemo \
// model.dataset.classes=[positive,neutral,negative] \
// model.train_ds.file_path=/home/TestData/nlp/ptune/data/train_0.txt \
// model.validation_ds.file_path=/home/TestData/nlp/ptune/data/validation_0.txt \
// model.test_ds.file_path=/home/TestData/nlp/ptune/data/test_0.txt "
// sh "rm -rf examples/nlp/language_modeling/ptune_results"
// }
// }
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
anyOf {
4 changes: 2 additions & 2 deletions docs/source/core/core.rst
@@ -84,11 +84,11 @@ Creating a NeMo model is similar to any other PyTorch workflow. We start by init
# instantiate a BERT based encoder
self.bert_model = get_lm_model(
pretrained_model_name=cfg.language_model.pretrained_model_name,
config_file=cfg.language_model.config_file,
config_dict=cfg.language_model.config,
checkpoint_file=cfg.language_model.lm_checkpoint,
vocab_file=cfg.tokenizer.vocab_file,
trainer=trainer,
cfg=cfg,
)
# instantiate the FFN for classification
58 changes: 0 additions & 58 deletions docs/source/nemo_text_processing/textprocessing_all.bib

This file was deleted.

4 changes: 0 additions & 4 deletions docs/source/nlp/api.rst
@@ -43,10 +43,6 @@ Modules
:show-inheritance:
:members:

.. autoclass:: nemo.collections.nlp.modules.common.megatron.MegatronBertEncoder
:show-inheritance:
:members:

.. autoclass:: nemo.collections.nlp.modules.AlbertEncoder
:show-inheritance:
:members:
6 changes: 3 additions & 3 deletions docs/source/nlp/text_normalization/nn_text_normalization.rst
@@ -66,7 +66,7 @@ In the example, ``self`` denotes that the spoken form is the same as the written
<eos> <eos>
More information about the Google text normalization dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge <https://arxiv.org/ftp/arxiv/papers/1611/1611.00068.pdf>`__ :cite:`nlp-textnorm-Sproat2016RNNAT`.
More information about the Google text normalization dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge <https://arxiv.org/ftp/arxiv/papers/1611/1611.00068.pdf>`__ :cite:`nlp-textnorm-sproat2016rnn`.
The script for splitting the Google text normalization data files into `train`, `dev`, `test` can be found here:
`data/data_split.py <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/duplex_text_normalization/data/data_split.py>`__.
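
For quick inspection, the raw Google text normalization files can be read with a few lines of Python. The sketch below assumes the commonly described layout (one tab-separated ``semiotic_class  written  spoken`` token per line, ``<eos>`` rows between sentences, and ``self``/``sil`` marking tokens whose spoken form is unchanged or silent); the helper name ``read_google_tn_file`` is illustrative and not part of NeMo.

.. code-block:: python

    # Sketch only: parse one Google TN data file into (written, spoken) sentence pairs,
    # assuming "<class>\t<written>\t<spoken>" token lines and "<eos>" rows between sentences.
    def read_google_tn_file(path):
        sentences, written, spoken = [], [], []
        with open(path, encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip("\n").split("\t")
                if parts[0] == "<eos>":
                    if written:
                        sentences.append((" ".join(written), " ".join(spoken)))
                    written, spoken = [], []
                    continue
                _cls, w, s = parts[:3]
                written.append(w)
                if s in ("self", "<self>"):
                    spoken.append(w)   # spoken form identical to the written form
                elif s != "sil":       # "sil": punctuation with no spoken form
                    spoken.append(s)
        return sentences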

@@ -228,7 +228,7 @@ This pipeline consists of
* adding space around `-` in alpha-numerical words, e.g. `2-car` -> `2 - car`
* converting unicode fraction e.g. ½ to 1/2
* normalizing greek letters and some special characters, e.g. `+` -> `plus`
* Moses :cite:`koehnetal2007moses`. tokenization/preprocessing of the input
* Moses :cite:`nlp-textnorm-koehn-etal-2007-moses`. tokenization/preprocessing of the input
* inference with neural tagger and decoder
* Moses postprocessing/ detokenization
* WFST-based grammars to verbalize some `VERBATIM`
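
A rough Python rendering of the first few character-level steps (the Moses and WFST stages are omitted). The character table and the function name below are only an illustrative subset of what the pipeline covers, not NeMo's actual implementation:

.. code-block:: python

    import re

    # Illustrative subset only; the real pipeline covers many more symbols.
    CHAR_MAP = {"½": "1/2", "¼": "1/4", "¾": "3/4", "+": " plus ", "β": " beta "}

    def basic_preprocess(text: str) -> str:
        # add space around '-' in alpha-numerical words, e.g. "2-car" -> "2 - car"
        text = re.sub(r"(?<=\w)-(?=\w)", " - ", text)
        # convert unicode fractions and a few special characters / greek letters
        for ch, repl in CHAR_MAP.items():
            text = text.replace(ch, repl)
        # collapse any duplicated whitespace introduced above
        return " ".join(text.split())

    print(basic_preprocess("a 2-car garage costs +½ more"))
    # -> "a 2 - car garage costs plus 1/2 more"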
@@ -245,7 +245,7 @@ transform them into the appropriate forms (e.g., spoken forms for TN and written
The decoder model is essentially a Transformer-based encoder-decoder seq2seq model (e.g., the example
training script uses the T5-base model by default). Overall, our design is partly inspired by the
RNN-based sliding window model proposed in the paper
`Neural Models of Text Normalization for Speech Applications <https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf>`__ :cite:`nlp-textnorm-Zhang2019NeuralMO`.
`Neural Models of Text Normalization for Speech Applications <https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf>`__ :cite:`nlp-textnorm-zhang2019neural`.
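
As a standalone illustration of that architecture choice, the decoder builds on a plain Hugging Face encoder-decoder. The snippet below only demonstrates the seq2seq interface with an off-the-shelf ``t5-base`` checkpoint (the default mentioned above); it is not a trained normalizer, and the prompt formatting used by the NeMo training script may differ.

.. code-block:: python

    # Sketch of the underlying encoder-decoder interface; a trained NeMo
    # checkpoint and the proper input formatting are needed for real TN/ITN.
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

    inputs = tokenizer("the rate was 5% on May 5 2021", return_tensors="pt")
    output_ids = model.generate(**inputs, max_length=64)
    print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))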

We introduce a simple but effective technique to allow our model to be duplex. Depending on the
task the model is handling, we append the appropriate prefix to the input. For example, suppose
34 changes: 34 additions & 0 deletions docs/source/nlp/text_normalization/tn_itn_all.bib
@@ -53,4 +53,38 @@ @incollection{mohri2009weighted
pages={213--254},
year={2009},
publisher={Springer}
}


@article{zhang2019neural,
title={Neural models of text normalization for speech applications},
author={Zhang, Hao and Sproat, Richard and Ng, Axel H and Stahlberg, Felix and Peng, Xiaochang and Gorman, Kyle and Roark, Brian},
journal={Computational Linguistics},
year={2019},
}


@inproceedings{koehn-etal-2007-moses,
title = "{M}oses: Open Source Toolkit for Statistical Machine Translation",
author = "Koehn, Philipp and
Hoang, Hieu and
Birch, Alexandra and
Callison-Burch, Chris and
Federico, Marcello and
Bertoldi, Nicola and
Cowan, Brooke and
Shen, Wade and
Moran, Christine and
Zens, Richard and
Dyer, Chris and
Bojar, Ond{\v{r}}ej and
Constantin, Alexandra and
Herbst, Evan",
booktitle = "Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions",
month = jun,
year = "2007",
address = "Prague, Czech Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P07-2045",
pages = "177--180",
}
@@ -44,7 +44,6 @@ model:
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null
nemo_file: null

classifier_head:
num_output_layers: 2
@@ -86,7 +86,6 @@ model:
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null
nemo_file: null


head:
2 changes: 1 addition & 1 deletion nemo/collections/asr/modules/rnnt.py
@@ -208,7 +208,7 @@ def predict(
Args:
y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding.
If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on Embedding.
If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on EmbeddiNg.
state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2.
Each state must be a tensor of shape [L, B, H].
@@ -59,8 +59,7 @@ def __init__(
self.cfg = cfg
self.data_prepared = False

self.setup_tokenizer(cfg.tokenizer)
super().__init__(cfg=cfg, trainer=trainer)
super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True)

if self.cfg.library == "huggingface":
self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name)
@@ -36,7 +36,6 @@
from nemo.collections.nlp.losses import SGDDialogueStateLoss
from nemo.collections.nlp.models.nlp_model import NLPModel
from nemo.collections.nlp.modules import SGDDecoder, SGDEncoder
from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
from nemo.collections.nlp.parts.utils_funcs import tensor2list
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import NeuralType
@@ -52,32 +51,18 @@ class SGDQAModel(NLPModel):
"""Dialogue State Tracking Model SGD-QA"""

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return self.bert_model.input_types

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return self.decoder.output_types
def output_module(self):
return self.decoder

def __init__(self, cfg: DictConfig, trainer: Trainer = None):

self.data_prepared = False
self.setup_tokenizer(cfg.tokenizer)
super().__init__(cfg=cfg, trainer=trainer)
self.bert_model = get_lm_model(
pretrained_model_name=cfg.language_model.pretrained_model_name,
config_file=self.register_artifact('language_model.config_file', cfg.language_model.config_file),
config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
checkpoint_file=cfg.language_model.lm_checkpoint,
vocab_file=self.register_artifact('tokenizer.vocab_file', cfg.tokenizer.vocab_file),
)

self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout)
self.decoder = SGDDecoder(embedding_dim=self.bert_model.config.hidden_size)
self.loss = SGDDialogueStateLoss(reduction="mean")

@typecheck()
def forward(self, input_ids, token_type_ids, attention_mask):
def forward(self, input_ids, attention_mask, token_type_ids):
token_embeddings = self.bert_model(
input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
)
@@ -71,11 +71,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
if trainer is not None:
self.world_size = trainer.num_nodes * trainer.num_gpus

self._tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer)
self.tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer)

super().__init__(cfg=cfg, trainer=trainer)
super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True)
self.model = AutoModelForSeq2SeqLM.from_pretrained(cfg.transformer)
self.max_sequence_len = cfg.get('max_sequence_len', self._tokenizer.model_max_length)
self.max_sequence_len = cfg.get('max_sequence_len', self.tokenizer.model_max_length)
self.mode = cfg.get('mode', 'joint')

self.transformer_name = cfg.transformer
@@ -103,7 +103,7 @@ def setup_cgs(self, cfg: DictConfig):
self.neural_confidence_threshold = cfg.get('neural_confidence_threshold', 0.99)
self.n_tagged = cfg.get('n_tagged', 1)
input_case = 'cased' # input_case is cased by default
if hasattr(self._tokenizer, 'do_lower_case') and self._tokenizer.do_lower_case:
if hasattr(self.tokenizer, 'do_lower_case') and self.tokenizer.do_lower_case:
input_case = 'lower_cased'
if not PYNINI_AVAILABLE:
raise Exception(
@@ -158,7 +158,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0, split="val"):
labels=batch['labels'],
)

labels_str = self._tokenizer.batch_decode(
labels_str = self.tokenizer.batch_decode(
torch.ones_like(batch['labels']) * ((batch['labels'] == -100) * 100) + batch['labels'],
skip_special_tokens=True,
)
@@ -291,7 +291,7 @@ def _generate_predictions(self, input_ids: torch.Tensor, model_max_len: int = 51
)

generated_ids, sequence_toks_scores = outputs['sequences'], outputs['scores']
generated_texts = self._tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

return generated_texts, generated_ids, sequence_toks_scores

@@ -319,7 +319,7 @@ def _infer(

if sum(nb_spans) == 0:
return [[]] * len(sents)
model, tokenizer = self.model, self._tokenizer
model, tokenizer = self.model, self.tokenizer
ctx_size = constants.DECODE_CTX_SIZE
extra_id_0 = constants.EXTRA_ID_0
extra_id_1 = constants.EXTRA_ID_1
@@ -370,7 +370,7 @@ def _infer(
# Compute selected_toks_probs
selected_toks_probs = []
for jx, _id in enumerate(cur_generated_ids):
if _id != self._tokenizer.pad_token_id:
if _id != self.tokenizer.pad_token_id:
selected_toks_probs.append(cur_toks_probs[jx, _id])
else:
selected_toks_probs.append(1)
Expand Down Expand Up @@ -481,7 +481,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):

dataset = TextNormalizationDecoderDataset(
input_file=input_file,
tokenizer=self._tokenizer,
tokenizer=self.tokenizer,
tokenizer_name=self.transformer_name,
mode=self.mode,
max_len=self.max_sequence_len,
@@ -504,7 +504,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):
self._val_id_to_class.append({v: k for k, v in dataset.label_ids_semiotic.items()})

data_collator = DataCollatorForSeq2Seq(
self._tokenizer, model=self.model, label_pad_token_id=constants.LABEL_PAD_TOKEN_ID, padding=True
self.tokenizer, model=self.model, label_pad_token_id=constants.LABEL_PAD_TOKEN_ID, padding=True
)
dl = torch.utils.data.DataLoader(
dataset=dataset,
