mcore ds fix #8283

Merged Feb 9, 2024 (29 commits)
Changes from 22 commits

Commits:
37ac5a3  [tutorial] fixed missing RIR scripts file. (#8257) (XuesongYang, Jan 29, 2024)
7b2415a  add values to en tts dict (#7879) (mgrafu, Jan 30, 2024)
5b2ffb6  mcore ds fix (Jan 31, 2024)
37284d3  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 31, 2024)
8c99a7f  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Jan 31, 2024)
6732410  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Jan 31, 2024)
12bc3cc  update mcore (dimapihtar, Jan 31, 2024)
35e1024  revert asr files (dimapihtar, Jan 31, 2024)
bec85bb  add comments (dimapihtar, Jan 31, 2024)
29ff2bd  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 31, 2024)
c84afa1  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 1, 2024)
0edd229  add support for mcore mock dataset (dimapihtar, Feb 2, 2024)
4098e53  update mcore version (dimapihtar, Feb 2, 2024)
a4630bf  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 2, 2024)
1c3f327  update gpt cfg (dimapihtar, Feb 2, 2024)
565565b  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 2, 2024)
e1ae18a  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 5, 2024)
a502a57  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 6, 2024)
ccaceb4  update mcore commit (dimapihtar, Feb 6, 2024)
95159a0  fix Bert unit tests (dimapihtar, Feb 7, 2024)
842b77b  update bert tests (dimapihtar, Feb 7, 2024)
ea5443e  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (pablo-garay, Feb 7, 2024)
ed69105  fix bert mcore test (dimapihtar, Feb 8, 2024)
0db10f6  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 8, 2024)
9de1ff8  fix gpt jenkins tests (dimapihtar, Feb 8, 2024)
9fa950e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 8, 2024)
a5d2e01  update apex & TE commits (dimapihtar, Feb 8, 2024)
005d58f  revert apex installation (dimapihtar, Feb 9, 2024)
5565c18  turn off the fusion for jenkins (dimapihtar, Feb 9, 2024)
Dockerfile (2 changes: 1 addition & 1 deletion)
@@ -66,7 +66,7 @@ WORKDIR /workspace/
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 27cbe46714a50c43ed290f1b1472db8d2780c55c && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .

# Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
Jenkinsfile (6 changes: 3 additions & 3 deletions)
@@ -85,7 +85,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout bed60a881f4b238b1c14b6c6a64997cc636e77b6 && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .'
}
}
@@ -3114,7 +3114,7 @@ pipeline {
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -3143,7 +3143,7 @@ pipeline {
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings"
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
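For context, a rough sketch of the same switch outside Jenkins. Reading NVTE_FLASH_ATTN=0 as a Transformer Engine setting that disables its flash-attention backend follows from the "turn off the fusion for jenkins" commit; the variable name and script path come from the diff above, while the trimmed override list is only illustrative.

    # Assumption: NVTE_FLASH_ATTN=0 makes Transformer Engine fall back to a
    # non-flash attention implementation for the whole run.
    export NVTE_FLASH_ATTN=0
    python examples/nlp/language_modeling/megatron_bert_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1
    # ...remaining overrides as in the Jenkins stage above
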
(model config YAML, file name not shown)
@@ -240,6 +240,7 @@ model:
pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled
exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem
mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset

# Nsys profiling options
nsys_profile:
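As a usage sketch, enabling the new option in this config section might look like the lines below. Only mock_dataset and its inline comment come from the diff; data_prefix: null is the YAML spelling of the "None" that comment asks for, and everything else in model.data is assumed unchanged.

    model:
      data:
        mock_dataset: True   # generate artificial samples inside megatron-core
        data_prefix: null    # per the comment above, no real data paths are needed
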
(tokenizer wrapper, file name not shown)
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
from typing import Optional

from transformers import AutoTokenizer as AUTOTOKENIZER
@@ -121,6 +122,9 @@ def __init__(
if token is not None and token not in self.tokenizer.get_vocab():
new_tokens_in_vocab.append(token)

# value is required for megatron-core
self.unique_identifiers = OrderedDict()

if len(new_tokens_in_vocab) > 0:
"""
Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
@@ -227,6 +231,11 @@ def bos_id(self):
def eos_id(self):
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def eod(self):
"""Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def sep_id(self):
return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
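A minimal sketch of the tokenizer surface these two additions create. The attribute names come from the diff; that megatron-core expects unique_identifiers and an eod property on whatever tokenizer it is handed is taken from the diff's own comments, not verified here.

    from collections import OrderedDict

    # Stand-in for the patched NeMo AutoTokenizer, reduced to the attributes
    # megatron-core is said to require (sketch, not the real class).
    class TokenizerForMcore:
        def __init__(self, eos_token_id: int):
            self._eos = eos_token_id
            self.unique_identifiers = OrderedDict()  # value required by megatron-core

        @property
        def eos_id(self) -> int:
            return self._eos

        @property
        def eod(self) -> int:
            # exact copy of eos_id, mirroring the new property in the diff
            return self.eos_id

    tok = TokenizerForMcore(eos_token_id=2)
    assert tok.eod == tok.eos_id == 2
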
(megatron GPT model module, file name not shown)
@@ -84,7 +84,7 @@
try:
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
@@ -1199,17 +1199,19 @@ def build_train_valid_test_datasets(self):
1
] = 1 # This is to make sure we only have one epoch on every validation iteration

mock_dataset = self.cfg.data.get("mock_dataset", False)
kwargs = {
"is_built_on_rank": is_dataset_built_on_rank,
"random_seed": self.cfg.seed,
"sequence_length": self.cfg.data.seq_length,
"blend": self.cfg.data.data_prefix,
"split": self.cfg.data.splits_string,
"path_to_cache": self.cfg.data.index_mapping_dir,
"tokenizer": self.tokenizer,
"reset_position_ids": self.reset_position_ids,
"reset_attention_mask": self.reset_attention_mask,
"eod_mask_loss": self.eod_mask_loss,
"eod_id": self.tokenizer.eos_id,
"mock": mock_dataset,
}

if self.cfg.data.get('add_fim', False):
@@ -1220,9 +1222,10 @@ def build_train_valid_test_datasets(self):
).build()
else:
dataset_config = GPTDatasetConfig(**kwargs)
dataset_type = MockGPTDataset if mock_dataset else GPTDataset

self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
GPTDataset, train_valid_test_num_samples, dataset_config,
dataset_type, train_valid_test_num_samples, dataset_config,
).build()

if self._train_ds is not None:
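Putting the pieces together, a mock-data smoke run could be launched roughly as below. The mock_dataset and data_prefix overrides are the options added in this PR; the GPT pretraining script path follows the examples/nlp/language_modeling layout used by the BERT commands above, and the trainer overrides are illustrative, so treat the command as a sketch rather than a tested invocation.

    # Hypothetical invocation: MockGPTDataset is selected because mock_dataset=True,
    # so no preprocessed .bin/.idx data files are needed.
    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.max_steps=10 \
        model.data.mock_dataset=True \
        model.data.data_prefix=null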