 
 HAVE_MEGATRON_CORE = False
 
-try:
-    from megatron.core import Timers
-
-    HAVE_MEGATRON_CORE_TIMERS = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_MEGATRON_CORE_TIMERS = False
-
 __all__ = ["MegatronBaseModel"]
 
 
@@ -132,17 +125,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
             else torch.float32
         )
 
-        self.megatron_timers = None
-        if self.cfg.get('enable_megatron_timers', False) and HAVE_MEGATRON_CORE_TIMERS:
-            self.megatron_timers_cfg = dict(self.cfg.get('megatron_timer_kwargs', dict()))
-            if 'log_every_n_steps' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['log_every_n_steps'] = self.trainer.log_every_n_steps
-            if 'log_option' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['log_option'] = 'minmax'  # minmax, max, all
-            if 'barrier' not in self.megatron_timers_cfg:
-                self.megatron_timers_cfg['barrier'] = False
-            self.megatron_timers = Timers(log_level=2, log_option=self.megatron_timers_cfg['log_option'])
-
         # set the megatron core model parallel config
         self.model_parallel_config: ModelParallelConfig = self.build_model_parallel_config()
 
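For context on the block deleted above: it was driven entirely by two model-config keys, `enable_megatron_timers` and `megatron_timer_kwargs`. A minimal, hypothetical sketch of how a model config would have filled those keys in (the key names and fallback defaults are taken from the deleted lines; the concrete values and the OmegaConf wrapper are illustrative only):

```python
# Hypothetical config sketch for the feature removed here. Key names and the
# fallback defaults mirror the deleted __init__ block; the values are made up.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "enable_megatron_timers": True,  # deleted code read cfg.get('enable_megatron_timers', False)
        "megatron_timer_kwargs": {
            "log_every_n_steps": 10,  # deleted default: trainer.log_every_n_steps
            "log_option": "minmax",   # deleted default: 'minmax' (other options: max, all)
            "barrier": False,         # deleted default: False
        },
    }
)

assert cfg.get("enable_megatron_timers", False)
```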
@@ -662,13 +644,6 @@ def sync_overlap_parameters(self, params=None):
     def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unused: Optional[int] = 0) -> None:
         super().on_train_batch_end(outputs, dataloader_iter, batch_idx)
 
-        # Megatron Timers
-        if self.megatron_timers:
-            if self.global_step % self.megatron_timers_cfg["log_every_n_steps"] == 0:
-                logging.info(
-                    "\n " + self.megatron_timers.get_all_timers_string(barrier=self.megatron_timers_cfg["barrier"])
-                )
-
         # TODO: Replace with newer override for scheduler.step() instead of
         # search for plugins for fp16 GradScalar
         if self.trainer.precision_plugin is not None and isinstance(
@@ -1098,7 +1073,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
             and megatron_amp_O2,  # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported
             "bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2,
             "params_dtype": self.params_dtype,
-            "timers": self.megatron_timers,
+            "timers": None,  # NeMo does not currently support megatron core timers
             "async_tensor_model_parallel_allreduce": self.cfg.get('tensor_model_parallel_world_size', 1) > 1
             and not self.cfg.get('sequence_parallel', False),
             "pipeline_dtype": pipeline_dtype,
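A note on the replacement line above: `megatron.core.ModelParallelConfig` already defaults its `timers` field to `None`, so passing `None` explicitly just documents that NeMo no longer wires a timer callable into the core config. A small sketch, assuming megatron-core is installed and that the `params_dtype`/`timers` field names match the version in use:

```python
# Sketch only (not NeMo code): with timers=None, megatron-core skips its
# internal timing calls; field names are assumed to match the installed version.
import torch
from megatron.core import ModelParallelConfig

config = ModelParallelConfig(params_dtype=torch.bfloat16, timers=None)
assert config.timers is None
```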
@@ -1211,16 +1186,3 @@ def configure_sharded_model(self):
         # Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid
         # out-of-memory crash before sharding. In case of GPU-initialized model, this is no-op.
         self.model = self.model.cuda(torch.cuda.current_device())
-
-    def megatron_timer_start(self, name, log_level):
-        if self.megatron_timers:
-            self.megatron_timers(name, log_level).start(barrier=False)
-
-    def megatron_timer_stop(self, name):
-        if self.megatron_timers:
-            self.megatron_timers(name).stop()
-
-    def optimizer_step(self, *args, **kwargs):
-        self.megatron_timer_start('optimizer', log_level=1)
-        super().optimizer_step(*args, **kwargs)
-        self.megatron_timer_stop('optimizer')
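Taken together, the deletions in this commit remove the full timer lifecycle: the guarded import, construction in `__init__`, start/stop around `optimizer_step`, and the periodic dump in `on_train_batch_end`. The consolidated sketch below strings the deleted pieces back together outside of NeMo, assuming the `megatron.core.Timers` interface the deleted code called (`Timers(log_level, log_option)`, `timers(name, log_level).start()` / `timers(name).stop()`, `get_all_timers_string(barrier=...)`) and a distributed context as in a normal Megatron job:

```python
# Standalone sketch of the removed instrumentation; not NeMo code. Assumes
# megatron-core is installed and torch.distributed is initialized (Timers
# aggregates timings across ranks when rendering its summary string).
import logging
import time

from megatron.core import Timers

logging.basicConfig(level=logging.INFO)

timers = Timers(log_level=2, log_option="minmax")  # mirrors the deleted __init__ construction
LOG_EVERY_N_STEPS = 10  # deleted code defaulted this to trainer.log_every_n_steps


def fake_optimizer_step() -> None:
    time.sleep(0.01)  # stand-in for the real optimizer work


for step in range(1, 51):
    # mirrors the deleted optimizer_step override
    timers("optimizer", log_level=1).start(barrier=False)
    fake_optimizer_step()
    timers("optimizer").stop()

    # mirrors the deleted on_train_batch_end hook
    if step % LOG_EVERY_N_STEPS == 0:
        logging.info("\n " + timers.get_all_timers_string(barrier=False))
```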