Skip to content

Commit

Permalink
mcore ds fix (#8283) (#8385)
Browse files Browse the repository at this point in the history
* [tutorial] fixed missing RIR scripts file. (#8257)

* add values to en tts dict (#7879)

* mcore ds fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update mcore

* revert asr files

* add comments

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add support for mcore mock dataset

* update mcore version

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update gpt cfg

* update mcore commit

* fix Bert unit tests

* update bert tests

* fix bert mcore test

* fix gpt jenkins tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update apex & TE commits

* revert apex installation

* turn off the fusion for jenkins

---------

Signed-off-by: Xuesong Yang <[email protected]>
Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Co-authored-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: Xuesong Yang <[email protected]>
Co-authored-by: Mariana <[email protected]>
Co-authored-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Pablo Garay <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
Signed-off-by: Pablo Garay <[email protected]>
  • Loading branch information
8 people committed Mar 19, 2024
1 parent a8125b2 commit 182e277
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 13 deletions.
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.12-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
Expand Down Expand Up @@ -66,19 +66,19 @@ WORKDIR /workspace/
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 27cbe46714a50c43ed290f1b1472db8d2780c55c && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .

# Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout b496d85fb88a801d8e680872a12822de310951fd && \
git checkout f058162b215791b15507bb542f22ccfde49c872d && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Transformer Engine 1.2.0
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
Expand Down
16 changes: 11 additions & 5 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
pipeline {
agent {
docker {
image 'nvcr.io/nvidia/pytorch:23.12-py3'
image 'nvcr.io/nvidia/pytorch:24.01-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1'
}
}

environment {
NVTE_FUSED_ATTN = 0
NVTE_FLASH_ATTN = 0
}

options {
timeout(time: 8, unit: 'HOURS')
disableConcurrentBuilds(abortPrevious: true)
Expand Down Expand Up @@ -62,7 +68,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
Expand All @@ -85,7 +91,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout bed60a881f4b238b1c14b6c6a64997cc636e77b6 && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .'
}
}
Expand Down Expand Up @@ -3227,7 +3233,7 @@ pipeline {
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
Expand Down Expand Up @@ -3257,7 +3263,7 @@ pipeline {
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings"
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
Expand Down
3 changes: 2 additions & 1 deletion examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ model:

##Offloading Activations/Weights to CPU
cpu_offloading: False
cpu_offloading_num_layers: 11 #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer
cpu_offloading_num_layers: ${sum:${.num_layers},-1} #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer
cpu_offloading_activations: True
cpu_offloading_weights: True

Expand Down Expand Up @@ -247,6 +247,7 @@ model:
pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled
exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem
mock_dataset: False # Set to True, and set data_prefix to None, to use an artificially generated mock dataset

# Nsys profiling options
nsys_profile:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
from typing import Optional

from transformers import AutoTokenizer as AUTOTOKENIZER
Expand Down Expand Up @@ -121,6 +122,9 @@ def __init__(
if token is not None and token not in self.tokenizer.get_vocab():
new_tokens_in_vocab.append(token)

# value is required for megatron-core
self.unique_identifiers = OrderedDict()

if len(new_tokens_in_vocab) > 0:
"""
Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
Expand Down Expand Up @@ -227,6 +231,11 @@ def bos_id(self):
def eos_id(self):
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def eod(self):
"""Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def sep_id(self):
return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
try:
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
Expand Down Expand Up @@ -1199,16 +1199,18 @@ def build_train_valid_test_datasets(self):
1
] = 1 # This is to make sure we only have one epoch on every validation iteration

mock_dataset = self.cfg.data.get("mock_dataset", False)
kwargs = {
"is_built_on_rank": is_dataset_built_on_rank,
"random_seed": self.cfg.seed,
"sequence_length": self.cfg.data.seq_length,
"split": self.cfg.data.splits_string,
"path_to_cache": self.cfg.data.index_mapping_dir,
"tokenizer": self.tokenizer,
"reset_position_ids": self.reset_position_ids,
"reset_attention_mask": self.reset_attention_mask,
"eod_mask_loss": self.eod_mask_loss,
"eod_id": self.tokenizer.eos_id,
"mock": mock_dataset,
}

if isinstance(self.cfg.data.data_prefix, DictConfig):
Expand All @@ -1225,9 +1227,10 @@ def build_train_valid_test_datasets(self):
).build()
else:
dataset_config = GPTDatasetConfig(**kwargs)
dataset_type = MockGPTDataset if mock_dataset else GPTDataset

self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
GPTDataset, train_valid_test_num_samples, dataset_config,
dataset_type, train_valid_test_num_samples, dataset_config,
).build()

if self._train_ds is not None:
Expand Down
3 changes: 3 additions & 0 deletions nemo/core/config/hydra_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ def _get_gpu_name():
# multiply interpolated values in the config
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True)

# sum interpolated values in the config
OmegaConf.register_new_resolver("sum", lambda x, y: x + y, replace=True)


def hydra_runner(
config_path: Optional[str] = ".", config_name: Optional[str] = None, schema: Optional[Any] = None
Expand Down

0 comments on commit 182e277

Please sign in to comment.