NVIDIA · ericharper · Apr 4, 2022 · Mar 31, 2022 · Apr 1, 2022 · Apr 1, 2022
diff --git a/nemo/collections/nlp/models/dialogue_state_tracking_sgdqa/sgdqa_model.py b/nemo/collections/nlp/models/dialogue_state_tracking_sgdqa/sgdqa_model.py
@@ -66,6 +66,9 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         token_embeddings = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
         )
+        if isinstance(token_embeddings, tuple):
+            token_embeddings = token_embeddings[0]
+
         encoded_utterance, token_embeddings = self.encoder(hidden_states=token_embeddings)
         (
             logit_intent_status,

diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py
@@ -65,6 +65,8 @@ def forward(self, input_ids, token_type_ids, attention_mask):
         hidden_states = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
         )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
 
         # normalize to unit sphere
         logits = torch.nn.functional.normalize(hidden_states[:, self._idx_conditioned_on], p=2, dim=1)

diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py
@@ -135,6 +135,9 @@ def forward(self, input_ids, token_type_ids, attention_mask):
         hidden_states = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
         )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
+
         output = self.pooler(hidden_states=hidden_states)
         return output
 

diff --git a/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py b/nemo/collections/nlp/models/information_retrieval/bert_joint_ir_model.py
@@ -63,6 +63,9 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         hidden_states = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
         )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
+
         scores = self.sim_score_regressor(hidden_states=hidden_states)
 
         return scores

diff --git a/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py b/nemo/collections/nlp/models/intent_slot_classification/intent_slot_classification_model.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import ntpath
 import os
 from typing import Dict, List, Optional
 
@@ -100,8 +101,8 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds):
                 {'intent_labels_file': 'intent_labels.csv', 'slot_labels_file': 'slot_labels.csv'}
             )
 
-        slot_labels_file = os.path.join(data_dir, cfg.class_labels.slot_labels_file)
-        intent_labels_file = os.path.join(data_dir, cfg.class_labels.intent_labels_file)
+        slot_labels_file = os.path.join(data_dir, ntpath.basename(cfg.class_labels.slot_labels_file))
+        intent_labels_file = os.path.join(data_dir, ntpath.basename(cfg.class_labels.intent_labels_file))
         self._save_label_ids(data_desc.slots_label_ids, slot_labels_file)
         self._save_label_ids(data_desc.intents_label_ids, intent_labels_file)
 
@@ -187,12 +188,11 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         No special modification required for Lightning, define it as you normally would
         in the `nn.Module` in vanilla PyTorch.
         """
-        if self._cfg.tokenizer.get('library', '') == 'megatron':
-            hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None)
-        else:
-            hidden_states = self.bert_model(
-                input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
-            )
+        hidden_states = self.bert_model(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
 
         intent_logits, slot_logits = self.classifier(hidden_states=hidden_states)
         return intent_logits, slot_logits

diff --git a/...ctions/nlp/models/intent_slot_classification_refactor/intent_slot_classification_model.py b/...ctions/nlp/models/intent_slot_classification_refactor/intent_slot_classification_model.py
@@ -198,6 +198,9 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         hidden_states = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
         )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
+
         intent_logits, slot_logits = self.classifier(hidden_states=hidden_states)
         return intent_logits, slot_logits
 

diff --git a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py
@@ -129,6 +129,9 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         hidden_states = self.bert_model(
             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
         )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
+
         mlm_log_probs = self.mlm_classifier(hidden_states=hidden_states)
         if self.only_mlm_loss:
             return (mlm_log_probs,)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py
@@ -149,6 +149,7 @@ def __init__(
         openai_gelu=False,
         onnx_safe=False,
         add_binary_head=True,
+        megatron_legacy=False,
     ):
         super(BertModel, self).__init__()
         # args = get_args()
@@ -189,6 +190,7 @@ def __init__(
             bias_gelu_fusion=bias_gelu_fusion,
             openai_gelu=openai_gelu,
             onnx_safe=onnx_safe,
+            megatron_legacy=megatron_legacy,
         )
 
         self.initialize_word_embeddings(
@@ -215,13 +217,15 @@ def set_input_tensor(self, input_tensor):
         """See megatron.model.transformer.set_input_tensor()"""
         self.language_model.set_input_tensor(input_tensor)
 
-    def forward(self, bert_model_input, attention_mask, tokentype_ids=None, lm_labels=None):
+    def forward(self, bert_model_input, attention_mask, token_type_ids=None, lm_labels=None):
 
         extended_attention_mask = bert_extended_attention_mask(attention_mask)
         input_ids = bert_model_input
         position_ids = build_position_ids(input_ids)
 
-        lm_output = self.language_model(input_ids, position_ids, extended_attention_mask, tokentype_ids=tokentype_ids)
+        lm_output = self.language_model(
+            input_ids, position_ids, extended_attention_mask, token_type_ids=token_type_ids
+        )
 
         if self.post_process and self.add_binary_head:
             lm_output, pooled_output = lm_output

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py
@@ -168,7 +168,7 @@ def forward(
         attention_mask,
         labels=None,
         prompt_ids=None,
-        tokentype_ids=None,
+        token_type_ids=None,
         layer_past=None,
         get_key_value=False,
         forward_method_parallel_output=None,

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -119,21 +119,22 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
             activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
             activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
             layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
-            masked_softmax_fusion=cfg.get('masked_softmax_fusion', False),
-            bias_gelu_fusion=cfg.get('bias_gelu_fusion', False),
+            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
+            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
             onnx_safe=cfg.get('onnx_safe', False),
             add_binary_head=cfg.bert_binary_head,
+            megatron_legacy=cfg.get('megatron_legacy', False),
         )
 
-    def forward(self, tokens, attention_mask, tokentype_ids, lm_labels):
-        output_tensor = self.model(tokens, attention_mask, tokentype_ids=tokentype_ids, lm_labels=lm_labels)
+    def forward(self, input_ids, attention_mask, token_type_ids, lm_labels=None):
+        output_tensor = self.model(input_ids, attention_mask, token_type_ids=token_type_ids, lm_labels=lm_labels)
         return output_tensor
 
     def training_step(self, batch, batch_idx):
         tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = self.process_batch(batch)
         if not self.cfg.bert_binary_head:
             types = None
-        output_tensor = self(tokens, padding_mask, tokentype_ids=types, lm_labels=lm_labels)
+        output_tensor = self(tokens, padding_mask, token_type_ids=types, lm_labels=lm_labels)
         loss_dict = self.loss_func(loss_mask, sentence_order, output_tensor)
         if 'sop loss' in loss_dict:
             lm_loss = loss_dict['lm loss']
@@ -176,7 +177,7 @@ def validation_step(self, batch, batch_idx):
         tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = self.process_batch(batch)
         if not self.cfg.bert_binary_head:
             types = None
-        output_tensor = self(tokens, padding_mask, tokentype_ids=types, lm_labels=lm_labels)
+        output_tensor = self(tokens, padding_mask, token_type_ids=types, lm_labels=lm_labels)
         loss_dict = self.loss_func(loss_mask, sentence_order, output_tensor)
         if 'sop loss' in loss_dict:
             lm_loss = loss_dict['lm loss']

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
@@ -185,7 +185,7 @@ def forward(
         decoder_input_ids,
         encoder_attn_mask,
         decoder_attn_mask,
-        tokentype_ids=None,
+        token_type_ids=None,
         lm_labels=None,
         enc_hidden_states=None,
         enc_output_mask=None,
@@ -197,7 +197,7 @@ def forward(
             dec_input_ids=decoder_input_ids,
             enc_attn_mask=encoder_attn_mask,
             dec_attn_mask=decoder_attn_mask,
-            tokentype_ids=tokentype_ids,
+            token_type_ids=token_type_ids,
             labels=lm_labels,
             enc_hidden_states=enc_hidden_states,
             enc_output_mask=enc_output_mask,
@@ -414,7 +414,7 @@ def fwd_output_and_loss_func(batch, model):
                 encoder_attn_mask,  # enc_attn_mask
                 decoder_input_ids,  # dec_input_ids
                 decoder_attn_mask,  # dec_attn_mask
-                None,  # tokentype_ids
+                None,  # token_type_ids
                 lm_labels,  # labels
                 None,  # enc_hidden_states
             )
@@ -437,7 +437,7 @@ def fwd_output_only_func(batch, model):
                 encoder_attn_mask,  # enc_attn_mask
                 decoder_input_ids,  # dec_input_ids
                 decoder_attn_mask,  # dec_attn_mask
-                None,  # tokentype_ids
+                None,  # token_type_ids
                 None,  # labels
                 None,  # enc_hidden_states
             )

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_ptune_t5_model.py b/nemo/collections/nlp/models/language_modeling/megatron_ptune_t5_model.py
@@ -190,7 +190,7 @@ def get_loss(self, batch):
                 enc_attn_mask=enc_mask,
                 dec_input_ids=tokens_dec,
                 dec_attn_mask=dec_mask,
-                tokentype_ids=None,
+                token_type_ids=None,
                 labels=labels,
                 enc_hidden_states=None,
                 output_enc_hidden_only=False,
@@ -203,7 +203,7 @@ def get_loss(self, batch):
                     enc_attn_mask=enc_mask,
                     dec_input_ids=tokens_dec,
                     dec_attn_mask=dec_mask,
-                    tokentype_ids=None,
+                    token_type_ids=None,
                     labels=labels,
                     enc_hidden_states=None,
                     output_enc_hidden_only=False,

diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py
@@ -79,16 +79,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=False):
             self.bert_model = get_lm_model(
                 config_file=config_file, config_dict=config_dict, vocab_file=vocab_file, trainer=trainer, cfg=cfg,
             )
-            if cfg.language_model.get('downstream'):
-                cfg.language_model.downstream = True
 
             # Required to pull up the config for MegatronBert models
             self.pretrained_model_name = cfg.language_model.pretrained_model_name
 
             # register encoder config
             self.register_bert_model()
 
-            if cfg.tokenizer.get("library", "") == 'megatron':
+            if "megatron" in cfg.tokenizer.get("tokenizer_name", ""):
                 self.hidden_size = self.bert_model.cfg.hidden_size
             else:
                 self.hidden_size = self.bert_model.config.hidden_size

diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py
@@ -58,12 +58,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
 
     @typecheck()
     def forward(self, input_ids, attention_mask, token_type_ids):
-        if self._cfg.tokenizer.get('library', '') == 'megatron':
-            hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None)
-        else:
-            hidden_states = self.bert_model(
-                input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
-            )
+        hidden_states = self.bert_model(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
         logits = self.classifier(hidden_states=hidden_states)
         return logits
 

diff --git a/nemo/collections/nlp/models/text_classification/text_classification_model.py b/nemo/collections/nlp/models/text_classification/text_classification_model.py
@@ -80,12 +80,11 @@ def forward(self, input_ids, attention_mask, token_type_ids):
         No special modification required for Lightning, define it as you normally would
         in the `nn.Module` in vanilla PyTorch.
         """
-        if self._cfg.tokenizer.get('library', '') == 'megatron':
-            hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None)
-        else:
-            hidden_states = self.bert_model(
-                input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
-            )
+        hidden_states = self.bert_model(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
         logits = self.classifier(hidden_states=hidden_states)
         return logits
 

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -150,12 +150,11 @@ def forward(
                 - ``capit_logits`` (:obj:`torch.Tensor`): a float torch tensor of shape
                   ``[Batch, Time, NumCapitalizationLabels]`` containing capitalization logits
         """
-        if self._cfg.tokenizer.get('library', '') == 'megatron':
-            hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None)
-        else:
-            hidden_states = self.bert_model(
-                input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
-            )
+        hidden_states = self.bert_model(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
 
         punct_logits = self.punct_classifier(hidden_states=hidden_states)
         capit_logits = self.capit_classifier(hidden_states=hidden_states)

diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py
@@ -104,13 +104,11 @@ def setup_loss(self, class_balancing: str = None):
 
     @typecheck()
     def forward(self, input_ids, attention_mask, token_type_ids):
-        if self._cfg.tokenizer.get('library', '') == 'megatron':
-            hidden_states, _ = self.bert_model(input_ids, attention_mask, tokentype_ids=token_type_ids, lm_labels=None)
-        else:
-            hidden_states = self.bert_model(
-                input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
-            )
-
+        hidden_states = self.bert_model(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
         logits = self.classifier(hidden_states=hidden_states)
         return logits
 

diff --git a/nemo/collections/nlp/modules/common/lm_utils.py b/nemo/collections/nlp/modules/common/lm_utils.py
@@ -87,7 +87,7 @@ def get_lm_model(
             f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used."
         )
 
-    if cfg.tokenizer is not None and cfg.tokenizer.get("library", "") == 'megatron':
+    if cfg.tokenizer is not None and "megatron" in cfg.tokenizer.get("tokenizer_name", ""):
         import torch
 
         from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel
@@ -99,12 +99,7 @@ def __init__(self):
             def forward(self, x, *args):
                 return x
 
-        # For finetuning a different downstream task dataset
-        if cfg.language_model.get('downstream'):
-            model = MegatronBertModel(cfg=cfg, trainer=trainer)
-        # For finetuning on a downstream task dataset for the first time
-        else:
-            model = MegatronBertModel.restore_from(restore_path=cfg.language_model.lm_checkpoint, trainer=trainer)
+        model = MegatronBertModel.restore_from(restore_path=cfg.language_model.lm_checkpoint, trainer=trainer)
 
         # remove the headers that are only relevant for pretraining
         model.model.lm_head = Identity()