From 8daa5a7ca6d7b96b6a984f94cf4fdbf22adc1780 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 2 Apr 2024 12:52:11 -0700 Subject: [PATCH 1/5] Update the interface of userbuffer tensor-parallel communication overlap (#8681) * update the interface of userbuffer tensor-parallel communication overlap Signed-off-by: Sangkug Lym * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../gpt_full_te_layer_autocast_spec.py | 154 +++++++++-------- .../modules/common/megatron/transformer.py | 156 ++++++++++-------- 2 files changed, 164 insertions(+), 146 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index df872e03c682..f89cbedf9f5d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from importlib.metadata import version from typing import Any, Callable, Optional import torch +from pkg_resources import packaging from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults from nemo.collections.nlp.parts import utils_funcs @@ -79,54 +81,56 @@ def __init__( ub_tp_comm_overlap: bool = False, ub_bulk_wgrad: bool = True, ub_bulk_dgrad: bool = True, - ub_split_ag: bool = True, - ub_split_rs: bool = True, - ub_atomic_gemm_ag: bool = False, - ub_atomic_gemm_rs: bool = False, autocast_dtype: Any = 16, zero_centered_gamma: bool = False, device: str = 'cuda', + **kwargs, ) -> None: if not HAVE_MEGATRON_CORE or not HAVE_TE: raise ImportError(IMPORT_ERROR) - super().__init__( - hidden_size=hidden_size, - ffn_hidden_size=ffn_hidden_size, - layernorm_epsilon=layernorm_epsilon, - num_attention_heads=num_attention_heads, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - hidden_dropout=hidden_dropout, - attention_dropout=attention_dropout, - layer_number=layer_number, - kv_channels=kv_channels, - self_attn_mask_type=self_attn_mask_type, - tp_group=tp_group, - tp_size=tp_size, - params_dtype=params_dtype, - get_rng_state_tracker=get_rng_state_tracker, - fuse_wgrad_accumulation=fuse_wgrad_accumulation, - seq_length=seq_length, - micro_batch_size=micro_batch_size, - sequence_parallel=sequence_parallel, - apply_residual_connection_post_layernorm=apply_residual_connection_post_layernorm, - output_layernorm=output_layernorm, - layer_type=layer_type, - drop_path_rate=drop_path_rate, - set_parallel_mode=tp_size > 1, - fuse_qkv_params=True, - zero_centered_gamma=zero_centered_gamma, - ub_tp_comm_overlap=ub_tp_comm_overlap, - ub_bulk_wgrad=ub_bulk_wgrad, - ub_bulk_dgrad=ub_bulk_dgrad, - ub_split_ag=ub_split_ag, - ub_split_rs=ub_split_rs, - ub_atomic_gemm_ag=ub_atomic_gemm_ag, - ub_atomic_gemm_rs=ub_atomic_gemm_rs, - device=device, - ) - # use_emha=use_emha, + transformer_layer_args = { + "hidden_size": hidden_size, + "ffn_hidden_size": ffn_hidden_size, + "layernorm_epsilon": layernorm_epsilon, + "num_attention_heads": num_attention_heads, + "init_method": init_method, + "output_layer_init_method": output_layer_init_method, + "hidden_dropout": hidden_dropout, + 
"attention_dropout": attention_dropout, + "layer_number": layer_number, + "kv_channels": kv_channels, + "self_attn_mask_type": self_attn_mask_type, + "tp_group": tp_group, + "tp_size": tp_size, + "params_dtype": params_dtype, + "get_rng_state_tracker": get_rng_state_tracker, + "fuse_wgrad_accumulation": fuse_wgrad_accumulation, + "seq_length": seq_length, + "micro_batch_size": micro_batch_size, + "sequence_parallel": sequence_parallel, + "apply_residual_connection_post_layernorm": apply_residual_connection_post_layernorm, + "output_layernorm": output_layernorm, + "layer_type": layer_type, + "drop_path_rate": drop_path_rate, + "set_parallel_mode": tp_size > 1, + "fuse_qkv_params": True, + "zero_centered_gamma": zero_centered_gamma, + "ub_tp_comm_overlap": ub_tp_comm_overlap, + "ub_bulk_wgrad": ub_bulk_wgrad, + "ub_bulk_dgrad": ub_bulk_dgrad, + "device": device, + } + te_version = packaging.version.Version(version("transformer-engine")) + if te_version > packaging.version.Version("1.5.0"): + transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) + transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + else: + transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) + transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) + transformer_layer_args["ub_atomic_gemm_ag"] = kwargs.get("ub_atomic_gemm_ag", False) + transformer_layer_args["ub_atomic_gemm_rs"] = kwargs.get("ub_atomic_gemm_rs", False) + super().__init__(**transformer_layer_args) # Dtype for forward pass - ignore amp O2 self.dtype = utils_funcs.torch_dtype_from_precision(autocast_dtype, megatron_amp_O2=None) @@ -172,38 +176,42 @@ def __init__(self, config, layer_number=1, hidden_dropout=None): self.is_first_microbatch = True precision = 'bf16' if config.bf16 else 16 - super().__init__( - hidden_size=config.hidden_size, - ffn_hidden_size=config.ffn_hidden_size, - layernorm_epsilon=config.layernorm_epsilon, - num_attention_heads=config.num_attention_heads, - init_method=config.init_method, - output_layer_init_method=config.output_layer_init_method, - hidden_dropout=config.hidden_dropout, - attention_dropout=config.attention_dropout, - layer_number=layer_number + self._get_layer_offset(), - kv_channels=config.kv_channels, - # self_attn_mask_type='causal', # Use default 'causal' - tp_size=parallel_state.get_tensor_model_parallel_world_size(), - params_dtype=config.params_dtype, - get_rng_state_tracker=tensor_parallel.random.get_cuda_rng_tracker, - fuse_wgrad_accumulation=config.gradient_accumulation_fusion, - seq_length=None, # used for jit warmup - micro_batch_size=None, # used for jit warmup - sequence_parallel=config.sequence_parallel, - apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm, - autocast_dtype=precision, - # use_emha=False, # Use default 'False' - ub_tp_comm_overlap=config.tp_comm_overlap, - ub_bulk_wgrad=config.tp_comm_bulk_wgrad, - ub_bulk_dgrad=config.tp_comm_bulk_dgrad, - ub_split_ag=config.tp_comm_split_ag, - ub_split_rs=config.tp_comm_split_rs, - ub_atomic_gemm_ag=config.tp_comm_atomic_ag, - ub_atomic_gemm_rs=config.tp_comm_atomic_rs, - zero_centered_gamma=config.layernorm_zero_centered_gamma, - device='cpu' if config.use_cpu_initialization else 'cuda', - ) + transformer_layer_args = { + "hidden_size": config.hidden_size, + "ffn_hidden_size": config.ffn_hidden_size, + "layernorm_epsilon": config.layernorm_epsilon, + "num_attention_heads": config.num_attention_heads, + "init_method": config.init_method, + 
"output_layer_init_method": config.output_layer_init_method, + "hidden_dropout": config.hidden_dropout, + "attention_dropout": config.attention_dropout, + "layer_number": layer_number + self._get_layer_offset(), + "kv_channels": config.kv_channels, + "tp_size": parallel_state.get_tensor_model_parallel_world_size(), + "params_dtype": config.params_dtype, + "get_rng_state_tracker": tensor_parallel.random.get_cuda_rng_tracker, + "fuse_wgrad_accumulation": config.gradient_accumulation_fusion, + "seq_length": None, # used for jit warmup + "micro_batch_size": None, # used for jit warmup + "sequence_parallel": config.sequence_parallel, + "apply_residual_connection_post_layernorm": config.apply_residual_connection_post_layernorm, + "autocast_dtype": precision, + "ub_tp_comm_overlap": config.tp_comm_overlap, + "ub_bulk_wgrad": config.tp_comm_bulk_wgrad, + "ub_bulk_dgrad": config.tp_comm_bulk_dgrad, + "zero_centered_gamma": config.layernorm_zero_centered_gamma, + "device": 'cpu' if config.use_cpu_initialization else 'cuda', + } + te_version = packaging.version.Version(version("transformer-engine")) + if te_version > packaging.version.Version("1.5.0"): + transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag + transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + else: + transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag + transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs + transformer_layer_args["ub_atomic_gemm_ag"] = config.tp_comm_atomic_ag + transformer_layer_args["ub_atomic_gemm_rs"] = config.tp_comm_atomic_rs + super().__init__(**transformer_layer_args) # Called by MCore's TransformerBlock.forward # megatron/core/transformer/transformer_block.py diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index f115b645666b..d37c1e75d341 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -15,11 +15,13 @@ """Transformer.""" from contextlib import nullcontext +from importlib.metadata import version from typing import Any, Callable, Optional import torch import torch.nn as nn from einops import rearrange +from pkg_resources import packaging from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( @@ -798,51 +800,53 @@ def __init__( ub_tp_comm_overlap: bool = False, ub_bulk_wgrad: bool = True, ub_bulk_dgrad: bool = True, - ub_split_ag: bool = True, - ub_split_rs: bool = True, - ub_atomic_gemm_ag: bool = False, - ub_atomic_gemm_rs: bool = False, autocast_dtype: Any = 16, zero_centered_gamma: bool = False, device: str = 'cuda', + **kwargs, ) -> None: - super().__init__( - hidden_size=hidden_size, - ffn_hidden_size=ffn_hidden_size, - layernorm_epsilon=layernorm_epsilon, - num_attention_heads=num_attention_heads, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - hidden_dropout=hidden_dropout, - attention_dropout=attention_dropout, - layer_number=layer_number, - kv_channels=kv_channels, - self_attn_mask_type=self_attn_mask_type, - tp_group=tp_group, - tp_size=tp_size, - params_dtype=params_dtype, - get_rng_state_tracker=get_rng_state_tracker, - fuse_wgrad_accumulation=fuse_wgrad_accumulation, - seq_length=seq_length, - micro_batch_size=micro_batch_size, - sequence_parallel=sequence_parallel, - 
apply_residual_connection_post_layernorm=apply_residual_connection_post_layernorm, - output_layernorm=output_layernorm, - layer_type=layer_type, - drop_path_rate=drop_path_rate, - set_parallel_mode=tp_size > 1, - fuse_qkv_params=True, - zero_centered_gamma=zero_centered_gamma, - ub_tp_comm_overlap=ub_tp_comm_overlap, - ub_bulk_wgrad=ub_bulk_wgrad, - ub_bulk_dgrad=ub_bulk_dgrad, - ub_split_ag=ub_split_ag, - ub_split_rs=ub_split_rs, - ub_atomic_gemm_ag=ub_atomic_gemm_ag, - ub_atomic_gemm_rs=ub_atomic_gemm_rs, - device=device, - ) - # use_emha=use_emha, + transformer_layer_args = { + "hidden_size": hidden_size, + "ffn_hidden_size": ffn_hidden_size, + "layernorm_epsilon": layernorm_epsilon, + "num_attention_heads": num_attention_heads, + "init_method": init_method, + "output_layer_init_method": output_layer_init_method, + "hidden_dropout": hidden_dropout, + "attention_dropout": attention_dropout, + "layer_number": layer_number, + "kv_channels": kv_channels, + "self_attn_mask_type": self_attn_mask_type, + "tp_group": tp_group, + "tp_size": tp_size, + "params_dtype": params_dtype, + "get_rng_state_tracker": get_rng_state_tracker, + "fuse_wgrad_accumulation": fuse_wgrad_accumulation, + "seq_length": seq_length, + "micro_batch_size": micro_batch_size, + "sequence_parallel": sequence_parallel, + "apply_residual_connection_post_layernorm": apply_residual_connection_post_layernorm, + "output_layernorm": output_layernorm, + "layer_type": layer_type, + "drop_path_rate": drop_path_rate, + "set_parallel_mode": tp_size > 1, + "fuse_qkv_params": True, + "zero_centered_gamma": zero_centered_gamma, + "ub_tp_comm_overlap": ub_tp_comm_overlap, + "ub_bulk_wgrad": ub_bulk_wgrad, + "ub_bulk_dgrad": ub_bulk_dgrad, + "device": device, + } + te_version = packaging.version.Version(version("transformer-engine")) + if te_version > packaging.version.Version("1.5.0"): + transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) + transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + else: + transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) + transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) + transformer_layer_args["ub_atomic_gemm_ag"] = kwargs.get("ub_atomic_gemm_ag", False) + transformer_layer_args["ub_atomic_gemm_rs"] = kwargs.get("ub_atomic_gemm_rs", False) + super().__init__(**transformer_layer_args) # Dtype for forward pass - ignore amp O2 self.dtype = utils_funcs.torch_dtype_from_precision(autocast_dtype, megatron_amp_O2=None) @@ -1065,38 +1069,44 @@ def build_layer(layer_number): lt = layer_type if self.transformer_engine: - return AutocastTransformerLayer( - hidden_size=hidden_size, - ffn_hidden_size=ffn_hidden_size, - layernorm_epsilon=layernorm_epsilon, - num_attention_heads=num_attention_heads, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - hidden_dropout=hidden_dropout, - attention_dropout=attention_dropout, - layer_number=layer_number + layer_number_offset, - kv_channels=kv_channels, - self_attn_mask_type=self_attn_mask_type.name, - tp_size=parallel_state.get_tensor_model_parallel_world_size(), - params_dtype=config.params_dtype, - get_rng_state_tracker=tensor_parallel.random.get_cuda_rng_tracker, - fuse_wgrad_accumulation=config.gradient_accumulation_fusion, - seq_length=None, # used for jit warmup - micro_batch_size=None, # used for jit warmup - sequence_parallel=config.sequence_parallel, - apply_residual_connection_post_layernorm=False, - autocast_dtype=precision, - use_emha=use_emha, - 
ub_tp_comm_overlap=ub_tp_comm_overlap, - ub_bulk_wgrad=config.tp_comm_bulk_wgrad, - ub_bulk_dgrad=config.tp_comm_bulk_dgrad, - ub_split_ag=config.tp_comm_split_ag, - ub_split_rs=config.tp_comm_split_rs, - ub_atomic_gemm_ag=config.tp_comm_atomic_ag, - ub_atomic_gemm_rs=config.tp_comm_atomic_rs, - zero_centered_gamma=normalization == 'layernorm1p', - device='cpu' if config.use_cpu_initialization else 'cuda', - ) + transformer_layer_args = { + "hidden_size": hidden_size, + "ffn_hidden_size": ffn_hidden_size, + "layernorm_epsilon": layernorm_epsilon, + "num_attention_heads": num_attention_heads, + "init_method": init_method, + "output_layer_init_method": output_layer_init_method, + "hidden_dropout": hidden_dropout, + "attention_dropout": attention_dropout, + "layer_number": layer_number + layer_number_offset, + "kv_channels": kv_channels, + "self_attn_mask_type": self_attn_mask_type.name, + "tp_size": parallel_state.get_tensor_model_parallel_world_size(), + "params_dtype": config.params_dtype, + "get_rng_state_tracker": tensor_parallel.random.get_cuda_rng_tracker, + "fuse_wgrad_accumulation": config.gradient_accumulation_fusion, + "seq_length": None, # used for jit warmup + "micro_batch_size": None, # used for jit warmup + "sequence_parallel": config.sequence_parallel, + "apply_residual_connection_post_layernorm": False, + "autocast_dtype": precision, + "use_emha": use_emha, + "ub_tp_comm_overlap": ub_tp_comm_overlap, + "ub_bulk_wgrad": config.tp_comm_bulk_wgrad, + "ub_bulk_dgrad": config.tp_comm_bulk_dgrad, + "zero_centered_gamma": normalization == 'layernorm1p', + "device": 'cpu' if config.use_cpu_initialization else 'cuda', + } + te_version = packaging.version.Version(version("transformer-engine")) + if te_version > packaging.version.Version("1.5.0"): + transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag + transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + else: + transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag + transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs + transformer_layer_args["ub_atomic_gemm_ag"] = config.tp_comm_atomic_ag + transformer_layer_args["ub_atomic_gemm_rs"] = config.tp_comm_atomic_rs + return AutocastTransformerLayer(**transformer_layer_args) else: return ParallelTransformerLayer( config=config, From a72a5e8ca7ce60a85691dc6ef985e8bd921fdbae Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 2 Apr 2024 14:41:16 -0700 Subject: [PATCH 2/5] Pass groupedgemm flag to spec maker; re-enable --transformer-engine spec. (#8731) * Pass groupedgemm flag to spec maker. Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use --transformer-engine flag to toggle TE backend in MCore. Signed-off-by: Alexandros Koumparoulis * Move self.transformer_engine higher Signed-off-by: Alexandros Koumparoulis * Import get_gpt_layer_local_spec Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use TE in tests Signed-off-by: Alexandros Koumparoulis * Warn user if megatron_amp_o2 is enabled but TE is not. 
Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 3 +++ .../language_modeling/megatron_gpt_model.py | 25 ++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 883f6e105ed0..07f34babccf9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3843,6 +3843,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.optim.name=distributed_fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=1 \ + model.transformer_engine=true \ model.optim.sched.constant_steps=1 \ model.optim.sched.min_lr=8e-5 \ model.max_position_embeddings=128 \ @@ -3885,6 +3886,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.seq_length=128 \ model.normalization=rmsnorm \ model.bias=False \ + model.transformer_engine=True \ model.bias_activation_fusion=False \ model.bias_dropout_add_fusion=False \ model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ @@ -3984,6 +3986,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.position_embedding_type=rope \ model.rotary_percentage=0.5 \ model.normalization=rmsnorm \ + model.transformer_engine=True \ model.bias=False \ model.bias_activation_fusion=False \ model.bias_dropout_add_fusion=False \ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 546ea429b149..925f92df250e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -93,7 +93,10 @@ from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel - from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, + ) from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig @@ -133,12 +136,15 @@ def mcore_supports_moe() -> bool: return False -def get_specs(spec_name, num_experts=None): +def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True): if num_experts is not None: assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE" + if use_te and spec_name == '': + spec_name = 'te_gpt' name_spec_dict = { - "": get_gpt_layer_with_transformer_engine_spec(num_experts), + "": get_gpt_layer_local_spec(num_experts, moe_grouped_gemm), + "te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm), "megatron_falcon_gpt": get_falcon_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(), "ammo": get_gpt_layer_ammo_spec(), @@ -301,6 +307,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.cfg.get('expert_model_parallel_size', 1) > 1 and self.with_distributed_adam: raise ValueError('Expert parallelism is currently not supporting distributed optimizer') + self.transformer_engine = 
cfg.get('transformer_engine', False) + if self.megatron_amp_O2 and not self.transformer_engine: + logging.warning('megatron_amp_O2 is enabled but transformer-engine is not.') + # build_model returns a list of modules which are used for interleaved pipeline parallelism if isinstance(self.trainer.accelerator, CPUAccelerator): self.model = build_model( @@ -341,8 +351,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False ) - self.transformer_engine = cfg.get('transformer_engine', False) - # configuration used for inference self._inference_config = None @@ -380,7 +388,12 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec=get_specs(self.spec_name, self.transformer_config.num_moe_experts), + transformer_layer_spec=get_specs( + self.spec_name, + self.transformer_config.num_moe_experts, + self.transformer_config.moe_grouped_gemm, + self.transformer_engine, + ), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, From 461294e662e9c025d1afb237c3f619e0cd7868c0 Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:03:16 -0700 Subject: [PATCH 3/5] Cherry pick the notebook fixes from 1.23.0 to main (#8556) Signed-off-by: Mingyuan Ma --- tutorials/multimodal/DreamBooth Tutorial.ipynb | 10 +++++----- .../multimodal/Multimodal Data Preparation.ipynb | 2 +- .../multimodal/Stable Diffusion Tutorial.ipynb | 16 ++++++++-------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tutorials/multimodal/DreamBooth Tutorial.ipynb b/tutorials/multimodal/DreamBooth Tutorial.ipynb index 8651b55d6308..f3444df25a51 100644 --- a/tutorials/multimodal/DreamBooth Tutorial.ipynb +++ b/tutorials/multimodal/DreamBooth Tutorial.ipynb @@ -124,10 +124,10 @@ "metadata": {}, "outputs": [], "source": [ - "! python /opt/NeMo/examples/multimodal/foundation/clip/convert_external_clip_to_nemo.py \\\n", + "! python /opt/NeMo/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py \\\n", " --arch ViT-L-14 \\\n", " --version openai \\\n", - " --hparams_file /opt/NeMo/examples/multimodal/foundation/clip/conf/megatron_clip_VIT-L-14.yaml \\\n", + " --hparams_file /opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml \\\n", " --nemo_file /ckpts/openai.nemo" ] }, @@ -167,7 +167,7 @@ "outputs": [], "source": [ "## This is the example command for running dreambooth training\n", - "! python /opt/NeMo/examples/multimodal/generative/dreambooth/dreambooth.py \\\n", + "! python /opt/NeMo/examples/multimodal/text_to_image/dreambooth/dreambooth.py \\\n", " model.unet_config.from_pretrained=/ckpts/unet.bin \\\n", " model.unet_config.from_NeMo=False \\\n", " model.first_stage_config.from_pretrained=/ckpts/vae.bin \\\n", @@ -196,7 +196,7 @@ "outputs": [], "source": [ "## This is the example command for running DreamBooth inference\n", - "! torchrun /opt/NeMo/examples/multimodal/generative/dreambooth/dreambooth_infer.py \\\n", + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/dreambooth/dreambooth_infer.py \\\n", " model.restore_from_path='/opt/NeMo/tutorials/multimodal/nemo_experiments/Dreambooth/checkpoints/Dreambooth.nemo' \\\n", " infer.num_images_per_prompt=4 \\\n", " infer.inference_steps=50 \\\n", @@ -270,4 +270,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tutorials/multimodal/Multimodal Data Preparation.ipynb b/tutorials/multimodal/Multimodal Data Preparation.ipynb index a65814c5c2eb..bc297a4e1f58 100644 --- a/tutorials/multimodal/Multimodal Data Preparation.ipynb +++ b/tutorials/multimodal/Multimodal Data Preparation.ipynb @@ -660,4 +660,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tutorials/multimodal/Stable Diffusion Tutorial.ipynb b/tutorials/multimodal/Stable Diffusion Tutorial.ipynb index ed794356f280..48da90dcb23d 100644 --- a/tutorials/multimodal/Stable Diffusion Tutorial.ipynb +++ b/tutorials/multimodal/Stable Diffusion Tutorial.ipynb @@ -9,7 +9,7 @@ "# Stable Diffusion Training / Inference Tutorial\n", "\n", "### Note:\n", - "Currently, this notebook must be run in a NeMo container. An example command to launch the container:\n", + "Currently, this notebook must be run in a NeMo container (> 24.01). An example command to launch the container:\n", "\n", "```\n", "docker run --gpus all -it --rm -v :/opt/NeMo --shm-size=8g \\\n", @@ -30,7 +30,7 @@ "\n", "## Datasets\n", "\n", - "Please refer to [ADD LINK]() for how to prepare a training dataset for Stable diffusion.\n", + "Please refer to [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for how to prepare a training dataset for Stable diffusion.\n", "\n", "For a pre-cached Stable Diffusion dataset, each webdataset tar file should, at a minimum, include the pickle files that store the pre-cached image and text features:\n", "\n", @@ -117,10 +117,10 @@ "metadata": {}, "outputs": [], "source": [ - "! python examples/multimodal/foundation/clip/convert_external_clip_to_nemo.py \\\n", + "! python /opt/NeMo/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py \\\n", " --arch ViT-L-14 \\\n", " --version openai \\\n", - " --hparams_file /opt/NeMo/examples/multimodal/foundation/clip/conf/megatron_clip_VIT-L-14.yaml \\\n", + " --hparams_file /opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml \\\n", " --nemo_file /ckpts/openai.nemo" ] }, @@ -151,7 +151,7 @@ "\n", "### Option 2: Training on Precached Dataset (Training UNet Only)\n", "\n", - "When using precached dataset (please refer to the [Dataset Tutorial](ADD_LINK) for details), every text feature and image feature are stored as key-value pairs in `.pickle` file:\n", + "When using precached dataset (please refer to the [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for details), every text feature and image feature are stored as key-value pairs in `.pickle` file:\n", "\n", "```\n", "{\n", @@ -201,7 +201,7 @@ "metadata": {}, "outputs": [], "source": [ - "! torchrun /opt/NeMo/examples/multimodal/generative/stable_diffusion/sd_train.py trainer.max_steps=100 model.data.synthetic_data=True" + "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_train.py trainer.max_steps=100 model.data.synthetic_data=True" ] }, { @@ -247,7 +247,7 @@ "metadata": {}, "outputs": [], "source": [ - "! ! 
torchrun /opt/NeMo/examples/multimodal/generative/stable_diffusion/sd_infer.py model.restore_from_path='/opt/NeMo/tutorials/multimodal/nemo_experiments/stable-diffusion-train/checkpoints/stable-diffusion-train.nemo'" + "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_infer.py model.restore_from_path='/opt/NeMo/tutorials/multimodal/nemo_experiments/stable-diffusion-train/checkpoints/stable-diffusion-train.nemo'" ] } ], @@ -272,4 +272,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file From 3497afa0b809254db8e5179bd792f70f05b323b5 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 2 Apr 2024 18:21:37 -0700 Subject: [PATCH 4/5] [Nemo CICD] (#8794) * Change container registry login * temp for test * Change container registry login * Revert "temp for test" This reverts commit 0cee494c6d94f3d6a33cc4d65674cf706f9ef45c. --- .github/workflows/cicd-main.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d0a74da92ab8..adb9c7e690b3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -62,11 +62,9 @@ jobs: # --env HYDRA_FULL_ERROR=1 steps: - name: Log into ACR (Azure Container Registry) # this login is for the pushing step after - uses: azure/docker-login@v1 - with: - login-server: nemoci.azurecr.io - password: ${{ secrets.ACR_PASSWORD }} - username: nemoci + run: | + # Login to Azure Container Registry + az acr login --name nemoci.azurecr.io - name: Checkout repository uses: actions/checkout@v2 From ea5d1eff442159d05a29b399fa40c4ea4dce2621 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 3 Apr 2024 07:34:00 -0700 Subject: [PATCH 5/5] Add legacy_dataset flag to use legacy NeMo dataset path instead of MCore (#8783) * Add legacy_dataset flag to use legacy NeMo dataset path instead of MCore Signed-off-by: Jaemin Choi * Fix get_batch Signed-off-by: Jaemin Choi --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi --- .../language_modeling/megatron_gpt_model.py | 80 +++++++++++-------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 925f92df250e..04f842ed1bca 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -35,6 +35,7 @@ MegatronPretrainingRandomSampler, MegatronPretrainingSampler, ) +from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import build_train_valid_test_datasets from nemo.collections.nlp.data.language_modeling.megatron.gpt_fim_dataset import ( GPTFIMDataset, GPTFIMDatasetConfig, @@ -971,9 +972,10 @@ def get_batch(self, data_iterator, tuning): 'tokens': data["tokens"], 'labels': data["labels"], 'loss_mask': data["loss_mask"], - 'attention_mask': data["attention_mask"], 'position_ids': data["position_ids"], } + if "attention_mask" in data: + batch['attention_mask'] = data["attention_mask"] return batch @@ -1309,41 +1311,55 @@ def build_train_valid_test_datasets(self): fim_tokens = [fim_tokens.prefix, fim_tokens.middle, fim_tokens.suffix, fim_tokens.pad, fim_tokens.eod] self.tokenizer.add_special_tokens({'additional_special_tokens': fim_tokens}) - mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False - kwargs = { - "is_built_on_rank": is_dataset_built_on_rank, - "random_seed": 
self.cfg.seed, - "sequence_length": self.cfg.data.seq_length, - "path_to_cache": self.cfg.data.index_mapping_dir, - "tokenizer": self.tokenizer, - "reset_position_ids": self.reset_position_ids, - "reset_attention_mask": self.reset_attention_mask, - "eod_mask_loss": self.eod_mask_loss, - "mock": mock_dataset, - "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), - } - - # support for dict data input type - if isinstance(self.cfg.data.data_prefix, DictConfig): - _pref = self.cfg.data.data_prefix - kwargs['blend_per_split'] = [_pref['train'], _pref['validation'], _pref['test']] + if self.cfg.data.get("legacy_dataset", False): + self._train_ds, self._validation_ds, self._test_ds = build_train_valid_test_datasets( + cfg=self.cfg, + trainer=self.trainer, + data_prefix=self.cfg.data.data_prefix, + data_impl=self.cfg.data.data_impl, + splits_string=self.cfg.data.splits_string, + train_valid_test_num_samples=train_valid_test_num_samples, + seq_length=self.cfg.data.seq_length, + seed=self.cfg.seed, + skip_warmup=self.cfg.data.get('skip_warmup', True), + tokenizer=self.tokenizer, + ) else: - kwargs['blend'] = self.cfg.data.data_prefix - kwargs["split"] = self.cfg.data.splits_string + mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False + kwargs = { + "is_built_on_rank": is_dataset_built_on_rank, + "random_seed": self.cfg.seed, + "sequence_length": self.cfg.data.seq_length, + "path_to_cache": self.cfg.data.index_mapping_dir, + "tokenizer": self.tokenizer, + "reset_position_ids": self.reset_position_ids, + "reset_attention_mask": self.reset_attention_mask, + "eod_mask_loss": self.eod_mask_loss, + "mock": mock_dataset, + "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), + } - if self.cfg.data.get('add_fim', False): - dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) + # support for dict data input type + if isinstance(self.cfg.data.data_prefix, DictConfig): + _pref = self.cfg.data.data_prefix + kwargs['blend_per_split'] = [_pref['train'], _pref['validation'], _pref['test']] + else: + kwargs['blend'] = self.cfg.data.data_prefix + kwargs["split"] = self.cfg.data.splits_string - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTFIMDataset, train_valid_test_num_samples, dataset_config, - ).build() - else: - dataset_config = GPTDatasetConfig(**kwargs) - dataset_type = MockGPTDataset if mock_dataset else GPTDataset + if self.cfg.data.get('add_fim', False): + dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) + + self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + GPTFIMDataset, train_valid_test_num_samples, dataset_config, + ).build() + else: + dataset_config = GPTDatasetConfig(**kwargs) + dataset_type = MockGPTDataset if mock_dataset else GPTDataset - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - dataset_type, train_valid_test_num_samples, dataset_config, - ).build() + self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + dataset_type, train_valid_test_num_samples, dataset_config, + ).build() if self._train_ds is not None: logging.info(f'Length of train dataset: {len(self._train_ds)}')
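
The recurring change in PATCH 1/5 gates the userbuffer tensor-parallel communication-overlap arguments on the installed Transformer Engine version: TE releases newer than 1.5.0 take ub_overlap_ag/ub_overlap_rs, while older releases take the ub_split_* and ub_atomic_gemm_* variants. Below is a minimal standalone sketch of that selection pattern, for reference only; the helper name select_ub_overlap_kwargs and its plain-dict argument are illustrative conveniences rather than code from the patch, while the kwarg names, defaults, 1.5.0 threshold, and version-parsing imports are taken from the patch (transformer-engine must be installed for its version to resolve).

    # Illustrative sketch, not part of the patch series.
    from importlib.metadata import version

    from pkg_resources import packaging  # same version-parsing imports as the patch


    def select_ub_overlap_kwargs(tp_comm_cfg: dict) -> dict:
        """Pick the userbuffer overlap kwargs supported by the installed TE."""
        te_version = packaging.version.Version(version("transformer-engine"))
        if te_version > packaging.version.Version("1.5.0"):
            # Newer TE replaces the split/atomic GEMM knobs with ub_overlap_*.
            return {
                "ub_overlap_ag": tp_comm_cfg.get("tp_comm_overlap_ag", True),
                "ub_overlap_rs": tp_comm_cfg.get("tp_comm_overlap_rs", True),
            }
        # Older TE still exposes the finer-grained split/atomic GEMM options.
        return {
            "ub_split_ag": tp_comm_cfg.get("tp_comm_split_ag", True),
            "ub_split_rs": tp_comm_cfg.get("tp_comm_split_rs", True),
            "ub_atomic_gemm_ag": tp_comm_cfg.get("tp_comm_atomic_ag", False),
            "ub_atomic_gemm_rs": tp_comm_cfg.get("tp_comm_atomic_rs", False),
        }

In the patch itself the selected entries are merged into transformer_layer_args before super().__init__(**transformer_layer_args) is called, so the same layer classes work against both old and new TE releases.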