
Commit 3a87eb2

Revert "Enable megatron core loggers for GPT pretraining (NVIDIA#8354) (NVIDIA#8384)"
This reverts commit 52f5611.
1 parent 3d51b17 commit 3a87eb2

4 files changed (+2, -54 lines)

examples/nlp/language_modeling/conf/megatron_gpt_config.yaml

-7
@@ -211,13 +211,6 @@ model:
   ## Network
   sharp: False # Enable the use of SHARP for NCCL data-parallel communications. This is going to be ignored if the network doesn't support SHARP.
 
-  ## Megatron timers
-  enable_megatron_timers: False
-  megatron_timer_kwargs:
-    log_every_n_steps: 10
-    log_mode: minmax
-    barrier: False
-
   data:
     # Path to data must be specified by the user.
     # Supports List, String and Dictionary

examples/nlp/language_modeling/conf/megatron_model_base_config.yaml

+1 -2
@@ -37,5 +37,4 @@ normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sq
 num_moe_experts: 1 # When >1, FFNs are changed to MoE layers
 moe_frequency: 1 # every Nth ffn layer will be made MoE
 moe_dropout: 0.0 # Dropout value for MoE layers
-use_flash_attention: false # Use flash attention in self-attention module
-enable_megatron_timers: false # Megatron timers
+use_flash_attention: false # Use flash attention in self-attention module

nemo/collections/nlp/models/language_modeling/megatron_base_model.py

+1 -39
@@ -70,13 +70,6 @@
 
 HAVE_MEGATRON_CORE = False
 
-try:
-    from megatron.core import Timers
-
-    HAVE_MEGATRON_CORE_TIMERS = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_MEGATRON_CORE_TIMERS = False
-
 __all__ = ["MegatronBaseModel"]
 
 
@@ -132,17 +125,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
             else torch.float32
         )
 
-        self.megatron_timers = None
-        if self.cfg.get('enable_megatron_timers', False) and HAVE_MEGATRON_CORE_TIMERS:
-            self.megatron_timers_cfg = dict(self.cfg.get('megatron_timer_kwargs', dict()))
-            if 'log_every_n_steps' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['log_every_n_steps'] = self.trainer.log_every_n_steps
-            if 'log_option' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['log_option'] = 'minmax' # minmax, max, all
-            if 'barrier' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['barrier'] = False
-            self.megatron_timers = Timers(log_level=2, log_option=self.megatron_timers_cfg['log_option'])
-
         # set the megatron core model parallel config
         self.model_parallel_config: ModelParallelConfig = self.build_model_parallel_config()
 
@@ -662,13 +644,6 @@ def sync_overlap_parameters(self, params=None):
     def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unused: Optional[int] = 0) -> None:
         super().on_train_batch_end(outputs, dataloader_iter, batch_idx)
 
-        # Megatron Timers
-        if self.megatron_timers:
-            if self.global_step % self.megatron_timers_cfg["log_every_n_steps"] == 0:
-                logging.info(
-                    "\n " + self.megatron_timers.get_all_timers_string(barrier=self.megatron_timers_cfg["barrier"])
-                )
-
         # TODO: Replace with newer override for scheduler.step() instead of
         # search for plugins for fp16 GradScalar
         if self.trainer.precision_plugin is not None and isinstance(
@@ -1098,7 +1073,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
             and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported
             "bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2,
             "params_dtype": self.params_dtype,
-            "timers": self.megatron_timers,
+            "timers": None, # NeMo does not currently support megatron core timers
             "async_tensor_model_parallel_allreduce": self.cfg.get('tensor_model_parallel_world_size', 1) > 1
             and not self.cfg.get('sequence_parallel', False),
             "pipeline_dtype": pipeline_dtype,
@@ -1211,16 +1186,3 @@ def configure_sharded_model(self):
         # Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid
         # out-of-memory carash before sharding. In case of GPU-initialized model, this is no-op.
         self.model = self.model.cuda(torch.cuda.current_device())
-
-    def megatron_timer_start(self, name, log_level):
-        if self.megatron_timers:
-            self.megatron_timers(name, log_level).start(barrier=False)
-
-    def megatron_timer_stop(self, name):
-        if self.megatron_timers:
-            self.megatron_timers(name).stop()
-
-    def optimizer_step(self, *args, **kwargs):
-        self.megatron_timer_start('optimizer', log_level=1)
-        super().optimizer_step(*args, **kwargs)
-        self.megatron_timer_stop('optimizer')
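
For reference, here is a minimal, self-contained sketch of the timing pattern this revert removes, assuming only the megatron.core Timers API as it appears in the deleted lines (Timers(log_level=..., log_option=...), named timer handles with start()/stop(), and get_all_timers_string()). The wrapper run_step_with_timers and its step_fn/log_step arguments are illustrative placeholders, not NeMo code.

# Illustrative sketch only; it mirrors the calls visible in the deleted NeMo code.
try:
    from megatron.core import Timers

    HAVE_MEGATRON_CORE_TIMERS = True
except (ImportError, ModuleNotFoundError):
    HAVE_MEGATRON_CORE_TIMERS = False


def run_step_with_timers(step_fn, log_step=True):
    """Hypothetical helper: time a callable the way the reverted code timed training regions."""
    if not HAVE_MEGATRON_CORE_TIMERS:
        return step_fn()

    # Same constructor arguments as the deleted __init__ code: log_level=2,
    # log_option one of 'minmax', 'max', 'all'.
    timers = Timers(log_level=2, log_option='minmax')

    timers('train_step', log_level=1).start(barrier=False)  # named timer, started without a barrier
    out = step_fn()
    timers('train_step').stop()

    if log_step:
        # Timing summary string; the deleted on_train_batch_end hook passed this to logging.info.
        print(timers.get_all_timers_string(barrier=False))
    return out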

nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py

-6
@@ -656,11 +656,8 @@ def training_step(self, dataloader_iter, batch_idx):
 
         # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced
         if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False):
-            self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1)
             self.allreduce_sequence_parallel_gradients()
-            self.megatron_timer_stop('allreduce_sequence_parallel_gradients')
 
-        self.megatron_timer_start('gradient_allreduce', log_level=1)
         if self.use_fsdp:
             # Reduce the gradients omitted from FSDP-sharding
             self.allreduce_fsdp_sharding_omitted_gradients()
@@ -678,15 +675,12 @@ def training_step(self, dataloader_iter, batch_idx):
             # async grad allreduce is not currently implemented for O1/autocasting mixed precision training
             # so we all-reduce gradients after the pipeline
             self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf)
-        self.megatron_timer_stop('gradient_allreduce')
 
         if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and self.cfg.get(
             'share_embeddings_and_output_weights', True
         ):
-            self.megatron_timer_start('allreduce_first_last_embeddings', log_level=1)
             # when using pipeline parallelism the first and last stage must keep embeddings in sync
             self.allreduce_first_last_embeddings()
-            self.megatron_timer_stop('allreduce_first_last_embeddings')
 
         ## logging
         if self.log_train_loss:
