mcore ds fix #8283

Merged Feb 9, 2024 (29 commits)
Changes from 22 commits

Commits:
37ac5a3  [tutorial] fixed missing RIR scripts file. (#8257) (XuesongYang, Jan 29, 2024)
7b2415a  add values to en tts dict (#7879) (mgrafu, Jan 30, 2024)
5b2ffb6  mcore ds fix (Jan 31, 2024)
37284d3  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 31, 2024)
8c99a7f  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Jan 31, 2024)
6732410  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Jan 31, 2024)
12bc3cc  update mcore (dimapihtar, Jan 31, 2024)
35e1024  revert asr files (dimapihtar, Jan 31, 2024)
bec85bb  add comments (dimapihtar, Jan 31, 2024)
29ff2bd  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 31, 2024)
c84afa1  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 1, 2024)
0edd229  add support for mcore mock dataset (dimapihtar, Feb 2, 2024)
4098e53  update mcore version (dimapihtar, Feb 2, 2024)
a4630bf  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 2, 2024)
1c3f327  update gpt cfg (dimapihtar, Feb 2, 2024)
565565b  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 2, 2024)
e1ae18a  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 5, 2024)
a502a57  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 6, 2024)
ccaceb4  update mcore commit (dimapihtar, Feb 6, 2024)
95159a0  fix Bert unit tests (dimapihtar, Feb 7, 2024)
842b77b  update bert tests (dimapihtar, Feb 7, 2024)
ea5443e  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (pablo-garay, Feb 7, 2024)
ed69105  fix bert mcore test (dimapihtar, Feb 8, 2024)
0db10f6  Merge branch 'r1.23.0' into dpykhtar/mcore_ds_fix (dimapihtar, Feb 8, 2024)
9de1ff8  fix gpt jenkins tests (dimapihtar, Feb 8, 2024)
9fa950e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 8, 2024)
a5d2e01  update apex & TE commits (dimapihtar, Feb 8, 2024)
005d58f  revert apex installation (dimapihtar, Feb 9, 2024)
5565c18  turn off the fusion for jenkins (dimapihtar, Feb 9, 2024)
Dockerfile (2 changes: 1 addition & 1 deletion)
@@ -66,7 +66,7 @@ WORKDIR /workspace/
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 27cbe46714a50c43ed290f1b1472db8d2780c55c && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .

# Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
Jenkinsfile (6 changes: 3 additions & 3 deletions)
@@ -85,7 +85,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout bed60a881f4b238b1c14b6c6a64997cc636e77b6 && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
pip install .'
}
}
@@ -3114,7 +3114,7 @@ pipeline {
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -3143,7 +3143,7 @@ pipeline {
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings"
sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \
sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
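For context, a rough sketch of the same switch outside Jenkins. Reading NVTE_FLASH_ATTN=0 as a Transformer Engine setting that disables its flash-attention backend follows from the "turn off the fusion for jenkins" commit; the variable name and script path come from the diff above, while the trimmed override list is only illustrative.

    # Assumption: NVTE_FLASH_ATTN=0 makes Transformer Engine fall back to a
    # non-flash attention implementation for the whole run.
    export NVTE_FLASH_ATTN=0
    python examples/nlp/language_modeling/megatron_bert_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1
    # ...remaining overrides as in the Jenkins stage above
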
(model config YAML, file name not shown)
@@ -240,6 +240,7 @@ model:
pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size
shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled
exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem
mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset

# Nsys profiling options
nsys_profile:
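As a usage sketch, enabling the new option in this config section might look like the lines below. Only mock_dataset and its inline comment come from the diff; data_prefix: null is the YAML spelling of the "None" that comment asks for, and everything else in model.data is assumed unchanged.

    model:
      data:
        mock_dataset: True   # generate artificial samples inside megatron-core
        data_prefix: null    # per the comment above, no real data paths are needed
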
(tokenizer wrapper, file name not shown)
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
from typing import Optional

from transformers import AutoTokenizer as AUTOTOKENIZER
@@ -121,6 +122,9 @@ def __init__(
if token is not None and token not in self.tokenizer.get_vocab():
new_tokens_in_vocab.append(token)

# value is required for megatron-core
self.unique_identifiers = OrderedDict()

if len(new_tokens_in_vocab) > 0:
"""
Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
@@ -227,6 +231,11 @@ def bos_id(self):
def eos_id(self):
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def eod(self):
"""Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def sep_id(self):
return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
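A minimal sketch of the tokenizer surface these two additions create. The attribute names come from the diff; that megatron-core expects unique_identifiers and an eod property on whatever tokenizer it is handed is taken from the diff's own comments, not verified here.

    from collections import OrderedDict

    # Stand-in for the patched NeMo AutoTokenizer, reduced to the attributes
    # megatron-core is said to require (sketch, not the real class).
    class TokenizerForMcore:
        def __init__(self, eos_token_id: int):
            self._eos = eos_token_id
            self.unique_identifiers = OrderedDict()  # value required by megatron-core

        @property
        def eos_id(self) -> int:
            return self._eos

        @property
        def eod(self) -> int:
            # exact copy of eos_id, mirroring the new property in the diff
            return self.eos_id

    tok = TokenizerForMcore(eos_token_id=2)
    assert tok.eod == tok.eos_id == 2
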
(megatron GPT model module, file name not shown)
@@ -84,7 +84,7 @@
try:
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
@@ -1199,17 +1199,19 @@ def build_train_valid_test_datasets(self):
1
] = 1 # This is to make sure we only have one epoch on every validation iteration

mock_dataset = self.cfg.data.get("mock_dataset", False)
kwargs = {
"is_built_on_rank": is_dataset_built_on_rank,
"random_seed": self.cfg.seed,
"sequence_length": self.cfg.data.seq_length,
"blend": self.cfg.data.data_prefix,
"split": self.cfg.data.splits_string,
"path_to_cache": self.cfg.data.index_mapping_dir,
"tokenizer": self.tokenizer,
"reset_position_ids": self.reset_position_ids,
"reset_attention_mask": self.reset_attention_mask,
"eod_mask_loss": self.eod_mask_loss,
"eod_id": self.tokenizer.eos_id,
"mock": mock_dataset,
}

if self.cfg.data.get('add_fim', False):
@@ -1220,9 +1222,10 @@ def build_train_valid_test_datasets(self):
).build()
else:
dataset_config = GPTDatasetConfig(**kwargs)
dataset_type = MockGPTDataset if mock_dataset else GPTDataset

self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
GPTDataset, train_valid_test_num_samples, dataset_config,
dataset_type, train_valid_test_num_samples, dataset_config,
).build()

if self._train_ds is not None:
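Putting the pieces together, a mock-data smoke run could be launched roughly as below. The mock_dataset and data_prefix overrides are the options added in this PR; the GPT pretraining script path follows the examples/nlp/language_modeling layout used by the BERT commands above, and the trainer overrides are illustrative, so treat the command as a sketch rather than a tested invocation.

    # Hypothetical invocation: MockGPTDataset is selected because mock_dataset=True,
    # so no preprocessed .bin/.idx data files are needed.
    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.max_steps=10 \
        model.data.mock_dataset=True \
        model.data.data_prefix=null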