tp_comm_overlap mistral-nemo
Signed-off-by: Alexandros Koumparoulis <[email protected]>
akoumpa committed Oct 2, 2024
1 parent e46e205 commit 2e136d6
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions nemo/collections/llm/recipes/mistral_nemo_12b.py
@@ -60,7 +60,7 @@ def trainer(
     pipeline_parallelism_type: Optional[torch.dtype] = None,
     virtual_pipeline_parallelism: Optional[int] = None,
     context_parallelism: int = 2,
-    sequence_parallelism: bool = False,
+    sequence_parallelism: bool = True,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     max_steps: int = 1168251,
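
This hunk flips the default of sequence_parallelism in the trainer() factory from False to True. Below is a minimal sketch of calling that factory with the new default spelled out explicitly; the module import path is assumed from the changed file, and the argument values are illustrative rather than taken from this commit.

# Sketch only: keep the parallelism intent visible at the call site.
# Import path assumed from nemo/collections/llm/recipes/mistral_nemo_12b.py.
from nemo.collections.llm.recipes import mistral_nemo_12b

trainer_cfg = mistral_nemo_12b.trainer(
    tensor_parallelism=2,       # illustrative; matches the pretrain_recipe hunk below
    context_parallelism=2,      # default shown in this hunk
    sequence_parallelism=True,  # new default after this commit
    num_nodes=1,
    num_gpus_per_node=8,
)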
@@ -175,11 +175,14 @@ def pretrain_recipe(
         fn,
         model=model(),
         trainer=trainer(
+            tensor_parallelism=2,
+            context_parallelism=2,
+            sequence_parallelism=False,
             num_nodes=num_nodes,
             num_gpus_per_node=num_gpus_per_node,
             callbacks=[run.Config(TimingCallback)],
         ),
-        data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
+        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
         resume=default_resume(),
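
This hunk pins tensor_parallelism=2, context_parallelism=2, and sequence_parallelism=False inside pretrain_recipe() and lowers the mock-data seq_length from 8192 to 4096. A short sketch of building the recipe and adjusting the data config afterwards; the keyword names come from the hunk above, while the import path and override values are assumptions for illustration.

# Sketch only: build the default pretraining recipe, then override the data
# config by attribute access (the same pattern the performance recipe uses
# below for recipe.trainer.callbacks).
from nemo.collections.llm.recipes import mistral_nemo_12b

recipe = mistral_nemo_12b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)

# seq_length now defaults to 4096 in this recipe; batch sizes are unchanged.
recipe.data.seq_length = 4096
recipe.data.micro_batch_size = 1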
@@ -226,7 +229,7 @@ def pretrain_recipe_performance(
     recipe.trainer.callbacks.append(
         run.Config(
             MegatronCommOverlapCallback,
-            tp_comm_overlap=False,
+            tp_comm_overlap=True,
         )
     )
     return recipe
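
The final hunk enables tensor-parallel communication overlap in the performance-tuned recipe by setting tp_comm_overlap=True on the MegatronCommOverlapCallback it appends. Overlapping TP collectives with GEMM compute generally requires sequence parallelism, which is why the first hunk turns sequence_parallelism on by default in trainer(). A sketch of building the performance recipe and reaching that callback config; the import path, keyword names, and list indexing are assumptions for illustration.

# Sketch only: the appended comm-overlap config now carries tp_comm_overlap=True.
from nemo.collections.llm.recipes import mistral_nemo_12b

perf_recipe = mistral_nemo_12b.pretrain_recipe_performance(num_nodes=1, num_gpus_per_node=8)

# The comm-overlap config is the last callback appended by the function above;
# setting the flag is plain attribute access on the run.Config object.
overlap_cfg = perf_recipe.trainer.callbacks[-1]
overlap_cfg.tp_comm_overlap = True  # explicit no-op after this commit; use False to restore the old behavior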