From 1e8aa8ca83a52bc617e48ef8a4e0c4f8838999ce Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 16 Feb 2024 10:16:40 -0700
Subject: [PATCH] mcore ds fix (#8283) (#8385)

* [tutorial] fixed missing RIR scripts file. (#8257)

* add values to en tts dict (#7879)

* mcore ds fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update mcore

* revert asr files

* add comments

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add support for mcore mock dataset

* update mcore version

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update gpt cfg

* update mcore commit

* fix Bert unit tests

* update bert tests

* fix bert mcore test

* fix gpt jenkins tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update apex & TE commits

* revert apex installation

* turn off the fusion for jenkins

---------

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: Mariana Graterol Fuenmayor
Signed-off-by: Dmytro Pykhtar
Signed-off-by: dimapihtar
Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com>
Co-authored-by: Dmytro Pykhtar
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Pablo Garay
Co-authored-by: Eric Harper
---
 Dockerfile                                    |  8 ++++----
 Jenkinsfile                                   | 16 +++++++++++-----
 .../conf/megatron_gpt_config.yaml             |  3 ++-
 .../tokenizers/huggingface/auto_tokenizer.py  |  9 +++++++++
 .../language_modeling/megatron_gpt_model.py   |  9 ++++++---
 nemo/core/config/hydra_runner.py              |  3 +++
 6 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6a5c48bee4c4..ec3e5dd87382 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.12-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3

 # build an image that includes only the nemo dependencies, ensures that dependencies
 # are included first for optimal caching, and useful for building a development
@@ -66,19 +66,19 @@ WORKDIR /workspace/
 # We leave it here in case we need to work off of a specific commit in main
 RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
   cd Megatron-LM && \
-  git checkout 27cbe46714a50c43ed290f1b1472db8d2780c55c && \
+  git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
   pip install .

 # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771
 RUN git clone https://github.com/NVIDIA/apex.git && \
   cd apex && \
-  git checkout b496d85fb88a801d8e680872a12822de310951fd && \
+  git checkout f058162b215791b15507bb542f22ccfde49c872d && \
   pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

 # Transformer Engine 1.2.0
 RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
   cd TransformerEngine && \
-  git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
+  git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
   git checkout FETCH_HEAD && \
   git submodule init && git submodule update && \
   NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
diff --git a/Jenkinsfile b/Jenkinsfile
index c2357e280afb..957b69e13c17 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,10 +1,16 @@
 pipeline {
   agent {
         docker {
-          image 'nvcr.io/nvidia/pytorch:23.12-py3'
+          image 'nvcr.io/nvidia/pytorch:24.01-py3'
           args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1'
         }
   }
+
+  environment {
+    NVTE_FUSED_ATTN = 0
+    NVTE_FLASH_ATTN = 0
+  }
+
   options {
     timeout(time: 8, unit: 'HOURS')
     disableConcurrentBuilds(abortPrevious: true)
@@ -62,7 +68,7 @@ pipeline {
       steps {
         sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
             cd TransformerEngine && \
-            git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
+            git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
             git checkout FETCH_HEAD && \
             git submodule init && git submodule update && \
             NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
@@ -85,7 +91,7 @@ pipeline {
       steps {
         sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
             cd Megatron-LM && \
-            git checkout bed60a881f4b238b1c14b6c6a64997cc636e77b6 && \
+            git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
             pip install .'
       }
     }
@@ -3227,7 +3233,7 @@ pipeline {
       }
       failFast true
       steps {
-        sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+        sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
@@ -3257,7 +3263,7 @@ pipeline {
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings"
-        sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+        sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 004e8b584a13..aaa00df2e006 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -204,7 +204,7 @@ model:

   ##Offloading Activations/Weights to CPU
   cpu_offloading: False
-  cpu_offloading_num_layers: 11 #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer
+  cpu_offloading_num_layers: ${sum:${.num_layers},-1} #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer
   cpu_offloading_activations: True
   cpu_offloading_weights: True

@@ -247,6 +247,7 @@ model:
   pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
   shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled
   exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem
+  mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset

   # Nsys profiling options
   nsys_profile:
diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
index e6e5840d93b9..4ed5dc07dbff 100644
--- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
+++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections import OrderedDict
 from typing import Optional

 from transformers import AutoTokenizer as AUTOTOKENIZER
@@ -121,6 +122,9 @@ def __init__(
             if token is not None and token not in self.tokenizer.get_vocab():
                 new_tokens_in_vocab.append(token)

+        # value is required for megatron-core
+        self.unique_identifiers = OrderedDict()
+
         if len(new_tokens_in_vocab) > 0:
             """
             Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
@@ -227,6 +231,11 @@ def bos_id(self):
     def eos_id(self):
         return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

+    @property
+    def eod(self):
+        """Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
+        return self.tokens_to_ids([getattr(self, 'eos_token')])[0]
+
     @property
     def sep_id(self):
         return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 2770090a7c1e..752696ac8faa 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -85,7 +85,7 @@
 try:
     from megatron.core import InferenceParams, parallel_state, tensor_parallel
     from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-    from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
+    from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
     from megatron.core.models.gpt import GPTModel as MCoreGPTModel
     from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
     from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
@@ -1199,16 +1199,18 @@ def build_train_valid_test_datasets(self):
                 1
             ] = 1  # This is to make sure we only have one epoch on every validation iteration

+        mock_dataset = self.cfg.data.get("mock_dataset", False)
         kwargs = {
             "is_built_on_rank": is_dataset_built_on_rank,
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
             "split": self.cfg.data.splits_string,
             "path_to_cache": self.cfg.data.index_mapping_dir,
+            "tokenizer": self.tokenizer,
             "reset_position_ids": self.reset_position_ids,
             "reset_attention_mask": self.reset_attention_mask,
             "eod_mask_loss": self.eod_mask_loss,
-            "eod_id": self.tokenizer.eos_id,
+            "mock": mock_dataset,
         }

         if isinstance(self.cfg.data.data_prefix, DictConfig):
@@ -1225,9 +1227,10 @@ def build_train_valid_test_datasets(self):
             ).build()
         else:
             dataset_config = GPTDatasetConfig(**kwargs)
+            dataset_type = MockGPTDataset if mock_dataset else GPTDataset

             self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
-                GPTDataset, train_valid_test_num_samples, dataset_config,
+                dataset_type, train_valid_test_num_samples, dataset_config,
             ).build()

         if self._train_ds is not None:
diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py
index 604d2134f66b..c3c5486d7408 100644
--- a/nemo/core/config/hydra_runner.py
+++ b/nemo/core/config/hydra_runner.py
@@ -47,6 +47,9 @@ def _get_gpu_name():
 # multiple interpolated values in the config
 OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True)

+# sum interpolated values in the config
+OmegaConf.register_new_resolver("sum", lambda x, y: x + y, replace=True)
+

 def hydra_runner(
     config_path: Optional[str] = ".", config_name: Optional[str] = None, schema: Optional[Any] = None
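
Reviewer note (not part of the patch): the config change above replaces the hard-coded cpu_offloading_num_layers: 11 with an interpolation that relies on the new "sum" resolver registered in nemo/core/config/hydra_runner.py. A minimal sketch of how that interpolation resolves, assuming OmegaConf is installed; the "model"/num_layers=12 values below are illustrative only, not taken from the patch.

# Sketch only: num_layers=12 is an assumed example value.
from omegaconf import OmegaConf

# Same registration as the hydra_runner.py change above.
OmegaConf.register_new_resolver("sum", lambda x, y: x + y, replace=True)

cfg = OmegaConf.create(
    {
        "model": {
            "num_layers": 12,
            # Same default pattern as megatron_gpt_config.yaml: num_layers - 1.
            "cpu_offloading_num_layers": "${sum:${.num_layers},-1}",
        }
    }
)

assert cfg.model.cpu_offloading_num_layers == 11

Tying the default to num_layers keeps the final layer from being offloaded for any model size, which the previous hard-coded 11 only guaranteed for a 12-layer config.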