Recipe changes for performance #11763

Open

wants to merge 24 commits into main
Changes from 10 commits

Commits (24)
f069a19
[Nemo2] allow setting CUDA_DEVICE_MAX_CONNECTIONS
guyueh1 Dec 28, 2024
87f1d43
Add a tp2 ub config
guyueh1 Dec 30, 2024
3bdda64
Recipe tuning for mixtral, nemotron4
Jan 7, 2025
43f45fa
Revert mixtral config change
guyueh1 Jan 8, 2025
8420b22
Decide cuda device max connections based on torch.cuda.get_device_cap…
guyueh1 Jan 8, 2025
633b903
Rename custom_cuda_device_max_connections to num_cuda_device_max_conn…
guyueh1 Jan 8, 2025
88c16c3
Merge branch 'main' into recipe_for_25.01
guyueh1 Jan 8, 2025
5ca96db
Apply isort and black reformatting
guyueh1 Jan 8, 2025
2b8114b
Remove explicit config of align_param_gather in mixtral recipe and us…
guyueh1 Jan 9, 2025
93cb713
Merge branch 'recipe_for_25.01' of github.com:guyueh1/NeMo into recip…
guyueh1 Jan 9, 2025
7c5530b
Revert "Remove explicit config of align_param_gather in mixtral recip…
guyueh1 Jan 9, 2025
e234588
Rename ub config; change proj to ring exchange for nemotron 340b
guyueh1 Jan 9, 2025
43d6e12
Merge branch 'main' into recipe_for_25.01
erhoo82 Jan 14, 2025
9d5cb11
Update the logic to set cuda_device_max_connections
guyueh1 Jan 15, 2025
0fd838e
Revert changes to PerfEnvPlugin
guyueh1 Jan 21, 2025
441036c
Move setup of CUDA_DEVICE_MAX_CONNECTIONS to MegatronCommOverlapCallback
guyueh1 Jan 23, 2025
b18ac96
Apply isort and black reformatting
guyueh1 Jan 23, 2025
1f2ff68
Add b200 tp overlap configs for gpt3 and llama3 models
guyueh1 Jan 24, 2025
5bf3f74
Merge branch 'recipe_for_25.01' of github.com:guyueh1/NeMo into recip…
guyueh1 Jan 24, 2025
6a218f1
Revert changes to nemotron recipe; will put those changes in performa…
guyueh1 Jan 24, 2025
c0d3777
Merge branch 'main' into recipe_for_25.01
Jan 28, 2025
544cd5a
Add two docstrings
guyueh1 Jan 29, 2025
83d35d5
Merge branch 'main' into recipe_for_25.01
erhoo82 Jan 29, 2025
530719a
Fix os.environ.pop
guyueh1 Jan 30, 2025
12 changes: 1 addition & 11 deletions nemo/collections/llm/recipes/mixtral_8x7b.py
@@ -210,20 +210,10 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
     It may not be suitable for all hardware configurations or use cases.
     """
 
-    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
-    # by MegatronCommOverlapCallback. They are added here for user's knowledge.
-    # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
-    # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
-    # each PP stage launches independently as needed.
-
     recipe.trainer.callbacks.extend(
         [
             run.Config(MegatronTokenDropCallback),
-            run.Config(
-                MegatronCommOverlapCallback,
-                overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to issue with checkpointing.
-                align_param_gather=True,
-            ),
+            run.Config(MegatronCommOverlapCallback),
         ]
     )
     recipe.trainer.strategy.expert_model_parallel_size = 1

erhoo82 marked this conversation as resolved (review thread on the removed overlap_param_gather_with_optimizer_step line).
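For reference, a recipe that still wants to pin these values explicitly could keep the previous callback configuration; the following is only a sketch based on the removed lines above, not something this PR adds:

# Sketch (not part of this PR): explicitly set the params that
# MegatronCommOverlapCallback otherwise configures automatically.
recipe.trainer.callbacks.extend(
    [
        run.Config(MegatronTokenDropCallback),
        run.Config(
            MegatronCommOverlapCallback,
            overlap_param_gather_with_optimizer_step=False,  # disabled due to a checkpointing issue
            align_param_gather=True,
        ),
    ]
)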
2 changes: 2 additions & 0 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -25,6 +25,7 @@
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback

@@ -202,6 +203,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
+            tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192,
         )
     )
     return recipe
4 changes: 4 additions & 0 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -25,6 +25,9 @@
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
+    userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096,
+)
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback

@@ -209,6 +212,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
+            tp_comm_overlap_cfg=userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
             overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
28 changes: 28 additions & 0 deletions nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py
@@ -88,6 +88,20 @@ class TransformerLayerTPOverlapCfg:
     fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
 )
 
+# llama3 70b
+userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
+    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
+    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
+    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
+    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
+    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    proj_fprop=RingExchangeOverlapCfg(),
+    fc2_fprop=RingExchangeOverlapCfg(),
+)
+
 # llama3.1 405b
 userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
     qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),

@@ -168,3 +182,17 @@ class TransformerLayerTPOverlapCfg:
     proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
     fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
 )
+
+# Nemotron 340B
+userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
+    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
+    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
+    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
+    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
+    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=2, set_sm_margin=True, fp8_buf=True),
+    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
+)

Collaborator review comment on userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096: Is this an overlap config for hopper or blackwell?

erhoo82 marked this conversation as resolved.
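For reference, the new userbuffers configs are consumed by passing them to MegatronCommOverlapCallback, as the nemotron recipe changes above do. A minimal sketch, assuming recipe is a run.Partial produced by one of the pretrain recipes:

import nemo_run as run

from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
    userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192,
)
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

# Attach TP-communication overlap with an explicit userbuffers config,
# mirroring the nemotron4_15b recipe change in this PR.
recipe.trainer.callbacks.append(
    run.Config(
        MegatronCommOverlapCallback,
        tp_comm_overlap=True,
        tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192,
    )
)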
26 changes: 20 additions & 6 deletions nemo/lightning/run/plugins.py
@@ -19,6 +19,7 @@
 from typing import Callable, Optional
 
 import nemo_run as run
+import torch
 import yaml
 from lightning.pytorch import Callback
 from lightning.pytorch.loggers import WandbLogger
@@ -27,7 +28,6 @@
 from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback
 from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
 from nemo.utils import logging
-
 from nemo.utils.import_utils import safe_import
 
 res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency')
@@ -315,6 +315,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
+    num_cuda_device_max_connections: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
         "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command"
@@ -341,11 +342,24 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
"""Enable the performance environment settings"""

if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
# Force program order kernel launch for TP, CP overlap
tp_size = task.trainer.strategy.tensor_model_parallel_size
cp_size = task.trainer.strategy.context_parallel_size
if tp_size > 1 or cp_size > 1:
executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
if torch.cuda.is_available():
major, _ = torch.cuda.get_device_capability()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@erhoo82 This method won't work because it's run on the cluster frontend node, not after slurm allocation. We need to found another way.

if major > 9:
if self.num_cuda_device_max_connections is not None:
executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
else:
# When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
# the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
# overlap between communication and computation kernels.
tp_size = task.trainer.strategy.tensor_model_parallel_size
cp_size = task.trainer.strategy.context_parallel_size
if tp_size > 1 or cp_size > 1:
executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
else:
if self.num_cuda_device_max_connections is not None:
executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
self.num_cuda_device_max_connections
)

# Set LayerNorm SM margin to support the overlap with LayerNorm kernel
if self.enable_layernorm_sm_margin:
Expand Down
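For illustration, the new num_cuda_device_max_connections knob would be used roughly as follows; this is a sketch assuming the usual nemo_run launch pattern, with the connection count chosen arbitrarily:

import nemo_run as run

from nemo.lightning.run.plugins import PerfEnvPlugin

# Sketch: ask PerfEnvPlugin to export a specific CUDA_DEVICE_MAX_CONNECTIONS.
# Per the setup() logic above, the explicit value is used when the detected
# compute capability major is > 9 (or when CUDA is unavailable on the launch
# node); otherwise the plugin falls back to "1" whenever TP or CP size is > 1.
perf_plugin = PerfEnvPlugin(
    enable_layernorm_sm_margin=True,
    num_cuda_device_max_connections=8,  # hypothetical value, for illustration only
)

# run.run(pretrain_recipe, executor=slurm_executor, plugins=[perf_plugin])  # assumed launch call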