From fe90ac2b1ae3fce0ba46c54d4518849b52ac88e2 Mon Sep 17 00:00:00 2001 From: vivian chen Date: Tue, 12 Sep 2023 16:20:08 +0000 Subject: [PATCH 01/69] support falcon --- .../convert_hf_falcon_to_nemo.py | 355 ++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100644 scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py new file mode 100644 index 000000000000..20a94f8f5c54 --- /dev/null +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -0,0 +1,355 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Conversion script to convert Huggingface Falcon 1B/7B/40B/180B checkpoints into nemo checkpoint. + +This script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP +values, then after running this script, please use the script located below to set the +TP/PP values you want: + NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py + +Example to run this conversion script: +``` + python convert_hf_falcon_to_nemo.py \ + --in-file \ + --out-file \ +``` +""" + +import os +import logging +import time +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.core.saving import _load_state as ptl_load_state +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, AutoModelForCausalLM, FalconConfig + + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) + + +# TODO: +# [Y] refactor ckpt func to make it cleaner +# [Y] dict tokenizer mapping for falcon family +# [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml +# [ ] safetensors loading. (only 180b used safetensors) +# [ ] test on non parallel attention model (block by no alibi support?) +# [Y] hf config name mapping for falcon 7b and 40b. +# [Y] trust remote code add +# [ ] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) +# [ ] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [ ] remove unnecessary comments and codes. 
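+# Key-mapping overview (illustrative summary of the conversions performed in convert() below;
+# {l} is the decoder layer index):
+#   transformer.word_embeddings.weight                -> model.embedding.word_embeddings.weight
+#   transformer.h.{l}.self_attention.query_key_value  -> model.decoder.layers.{l}.self_attention.linear_qkv
+#   transformer.h.{l}.self_attention.dense            -> model.decoder.layers.{l}.self_attention.linear_proj
+#   transformer.h.{l}.mlp.dense_h_to_4h               -> model.decoder.layers.{l}.mlp.linear_fc1
+#   transformer.h.{l}.mlp.dense_4h_to_h               -> model.decoder.layers.{l}.mlp.linear_fc2
+#   transformer.ln_f                                  -> model.decoder.final_layernorm
+#   lm_head.weight                                    -> model.output_layer.weight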
+ +def setup_logging(log_file="test_log.txt"): + logging.basicConfig(filename=log_file, level=logging.INFO, + format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--in-file", type=str, default=None, required=True, help="Path to Huggingface Falcon checkpoints", + ) + parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--tokenizer-type", type=str, default="tiiuae/falcon-7b", help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'." + ) + args = parser.parse_args() + return args + + +def load_model(cls, checkpoint, strict, **kwargs): + try: + if 'cfg' in kwargs: + model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) + else: + model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) + for name, module in model.named_parameters(): + if name in checkpoint['state_dict']: + module.data = checkpoint['state_dict'][name] + checkpoint['state_dict'].pop(name) + else: + print(f"Unexpected key: {name} not in checkpoint but in model.") + if len(checkpoint['state_dict'].keys()) != 0: + raise RuntimeError( + f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." + ) + + # register the artifacts + cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] + if cfg.tokenizer.model is not None: + model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) + if cfg.tokenizer.vocab_file is not None: + model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) + if cfg.tokenizer.merge_file is not None: + model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) + finally: + cls._set_model_restore_state(is_being_restored=False) + return model + + +def load_falcon_config(args) -> FalconConfig: + """ Helper utility to load FalconConfig. + + 7B and 40B are not compatible with `transformers.FalconConfig` and + `transformers.FalconModel`. need to manually set the config values + and force to `falcon` model type. 
+ """ + config = FalconConfig.from_pretrained(args.in_file) + + if config.model_type == 'RefinedWeb': + mappings = { + "num_hidden_layers": config.n_layer, + "num_attention_heads": config.n_head, + "num_kv_heads": config.n_head_kv, + "new_decoder_architecture": True + } + elif config.model_type == 'RefinedWebModel': + mappings = { + "num_hidden_layers": config.n_layer, + "num_attention_heads": config.n_head, + "num_kv_heads": 1 if config.multi_query else config.n_head, + "new_decoder_architecture": False + } + else: + return config + + for key, value in mappings.items(): + setattr(config, key, value) + + config.model_type = 'falcon' + return config + + +def load_config(args): + falcon_config = load_falcon_config(args) + logging.info(f"falcon_config, {falcon_config}") + nemo_config = OmegaConf.load( + os.path.join(os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml') + ).model + nemo_config.encoder_seq_length = falcon_config.max_position_embeddings + nemo_config.num_layers = int(falcon_config.num_hidden_layers) + nemo_config.hidden_size = falcon_config.hidden_size + nemo_config.num_attention_heads = falcon_config.num_attention_heads + nemo_config.max_position_embeddings = falcon_config.max_position_embeddings + nemo_config.init_method_std = falcon_config.initializer_range + nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon + if falcon_config.alibi: + raise ValueError("Alibi is not yet supported in Megatron Core") + else: + nemo_config.position_embedding_type = 'rope' + nemo_config.bias = falcon_config.bias + nemo_config.hidden_dropout = falcon_config.hidden_dropout + nemo_config.attention_dropout = falcon_config.attention_dropout + # need to map to nemo config as well. + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
+ } + + nemo_config.tokenizer = tokenizer_dict + ############################################## + # need refactor Mcore to support parallel attn + #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn + #nemo_config.parallel_attention = falcon_config['parallel_attn'] + ############################################### + #if hasattr(falcon_config,'num_kv_heads'): + if falcon_config.new_decoder_architecture or falcon_config.multi_query: + nemo_config.num_query_groups = falcon_config.num_kv_heads + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'gelu' + if falcon_config.rope_scaling is not None: + if falcon_config.rope_scaling.type == 'linear': + nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor + else: + raise ValueError("Only linear rope scaling type is supported now") + + base = 128 + while falcon_config.vocab_size % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.in_file}") + tik = time.time() + model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) + #tokenizer = AutoTokenizer.from_pretrained(args.in_file) + hf_config = load_falcon_config(args) + + # print(f"hf_config: {hf_config}") + # print("named parameters:") + # for name, param in model.named_parameters(): + # print(f"- {name}") + + # add debug state dict list + hf_keys = list(model.state_dict().keys()) + + nemo_config = load_config(args) + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + if precision == 32: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision == ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + dtype = torch.float32 # fallback + + nemo_config.precision = precision + + trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) + + hidden_size = hf_config.hidden_size + head_num = hf_config.num_attention_heads + head_size = hidden_size // head_num + num_layers = hf_config.num_hidden_layers + + nemo_config.mcore_gpt = True + nemo_config.transformer_engine = True + logging.info(f"nemo_config {nemo_config}") + logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") + logging.info(f"transformer_engine: {nemo_config.transformer_engine}") + # assert nemo_config.mcore_gpt == nemo_config.get( + # 'transformer_engine', False + # ), "mcore_gpt transformer_engine must be enabled (or disabled) together." + + param_to_weights = lambda param: param.float() + + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): + source_name = f"{source_prefix}.{weight_or_bias}" + if source_name in model.state_dict(): + target_name = f"{target_prefix}.{weight_or_bias}" + checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) + + # add debug remove mapped keys + if source_name in hf_keys: + hf_keys.remove(source_name) + + def add_weight_and_possible_bias(source_prefix, target_prefix): + add_to_checkpoint(source_prefix, target_prefix, 'weight') + if f"{source_prefix}.bias" in model.state_dict(): + add_to_checkpoint(source_prefix, target_prefix, 'bias') + + add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') + + if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: + num_query_groups = head_num + else: + num_query_groups = nemo_config.num_query_groups + assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + + for l in range(int(num_layers)): + print(f"converting layer {l}") + prefix = f'transformer.h.{l}' + + + # HF: [num_heads x 3 x head_dim, hidden_size], interleaved qkv weights + # Query types and expected kv heads. 
+ # - MHA: num_heads = num_kv_heads + # - Multi-Query Attention: num_kv_heads = 1 + # - Grouped-Query Attention: num_heads % num_kv_heads = 0 + + add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') + add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') + add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') + add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') + + if hf_config.new_decoder_architecture: + add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + else: + add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + if not hf_config.parallel_attn: + add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm') + + print(f"done layer {l}") + + # final layer norm + add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') + + # LM weight + add_to_checkpoint('lm_head', 'model.output_layer','weight') + + if hf_keys: + logging.warning(f"Some keys in HuggingFace's model didn't get mapped to NeMo's state_dict: {hf_keys}") + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logging.info(f'Weights loaded. Total time: {t}') + + del model + + model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + + model._save_restore_connector = NLPSaveRestoreConnector() + + # cast to target precision and disable cpu init + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + + model.save_to(args.out_file) + logging.info(f'NeMo model saved to: {args.out_file}') + + +if __name__ == '__main__': + setup_logging() + args = get_args() + convert(args) \ No newline at end of file From ffaf2289aedca120a794c6648674ad4c38206d9c Mon Sep 17 00:00:00 2001 From: vivian chen Date: Tue, 12 Sep 2023 18:01:25 +0000 Subject: [PATCH 02/69] support falcon bug fix layernorm naming --- .../convert_hf_falcon_to_nemo.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 20a94f8f5c54..b7b3ed55e3b7 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -89,7 +89,7 @@ def load_model(cls, checkpoint, strict, **kwargs): module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) else: - print(f"Unexpected key: {name} not in checkpoint but in model.") + logging.info(f"Unexpected key: {name} not in checkpoint but in model.") if len(checkpoint['state_dict'].keys()) != 0: raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." 
@@ -259,6 +259,8 @@ def convert(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True + nemo_config.bias_activation_fusion = False + logging.info(f"nemo_config {nemo_config}") logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") logging.info(f"transformer_engine: {nemo_config.transformer_engine}") @@ -271,20 +273,23 @@ def convert(args): checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): - target_name = f"{target_prefix}.{weight_or_bias}" + if is_layernorm: + target_name = f"{target_prefix}_{weight_or_bias}" + else: + target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) # add debug remove mapped keys if source_name in hf_keys: hf_keys.remove(source_name) - def add_weight_and_possible_bias(source_prefix, target_prefix): - add_to_checkpoint(source_prefix, target_prefix, 'weight') + def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): + add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias') + add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') @@ -311,12 +316,12 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if hf_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) else: - add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm') + add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) if not hf_config.parallel_attn: - add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm') + add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) print(f"done layer {l}") From 562e6f08a9129fdd88b5411b67c3811e86af4ec3 Mon Sep 17 00:00:00 2001 From: vivian chen Date: Wed, 13 Sep 2023 00:37:36 +0000 Subject: [PATCH 03/69] fix todo --- .../convert_hf_falcon_to_nemo.py | 186 +++++++----------- 1 file changed, 76 insertions(+), 110 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index b7b3ed55e3b7..99e7866e43a2 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -25,6 +25,8 @@ python 
convert_hf_falcon_to_nemo.py \ --in-file \ --out-file \ + --tokenizer-type \ + --precision ``` """ @@ -55,15 +57,15 @@ # [Y] dict tokenizer mapping for falcon family # [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml # [ ] safetensors loading. (only 180b used safetensors) -# [ ] test on non parallel attention model (block by no alibi support?) +# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) # [Y] hf config name mapping for falcon 7b and 40b. # [Y] trust remote code add -# [ ] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) -# [ ] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) +# [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error # [ ] remove unnecessary comments and codes. def setup_logging(log_file="test_log.txt"): - logging.basicConfig(filename=log_file, level=logging.INFO, + logging.basicConfig(filename=log_file, level=logging.DEBUG, format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') def get_args(): parser = ArgumentParser() @@ -94,15 +96,6 @@ def load_model(cls, checkpoint, strict, **kwargs): raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." ) - - # register the artifacts - cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] - if cfg.tokenizer.model is not None: - model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) - if cfg.tokenizer.vocab_file is not None: - model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) - if cfg.tokenizer.merge_file is not None: - model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) finally: cls._set_model_restore_state(is_being_restored=False) return model @@ -111,7 +104,7 @@ def load_model(cls, checkpoint, strict, **kwargs): def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. - 7B and 40B are not compatible with `transformers.FalconConfig` and + Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and `transformers.FalconModel`. need to manually set the config values and force to `falcon` model type. """ @@ -141,7 +134,7 @@ def load_falcon_config(args) -> FalconConfig: return config -def load_config(args): +def load_nemo_config(args): falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") nemo_config = OmegaConf.load( @@ -154,14 +147,18 @@ def load_config(args): nemo_config.max_position_embeddings = falcon_config.max_position_embeddings nemo_config.init_method_std = falcon_config.initializer_range nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - if falcon_config.alibi: - raise ValueError("Alibi is not yet supported in Megatron Core") - else: + try: + if falcon_config.alibi: + raise ValueError("Alibi is not yet supported in Megatron Core, \ + force to use RoPE will generate suboptimal responses") + except ValueError as e: + print(e) + finally: nemo_config.position_embedding_type = 'rope' nemo_config.bias = falcon_config.bias nemo_config.hidden_dropout = falcon_config.hidden_dropout nemo_config.attention_dropout = falcon_config.attention_dropout - # need to map to nemo config as well. + # TODO: how does vocab_file, merge_file etc get mapped automatically in respect to variants of falcon models? 
tokenizer_dict = { 'library': 'huggingface', 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? @@ -169,13 +166,12 @@ def load_config(args): nemo_config.tokenizer = tokenizer_dict ############################################## - # need refactor Mcore to support parallel attn + # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn #nemo_config.parallel_attention = falcon_config['parallel_attn'] ############################################### - #if hasattr(falcon_config,'num_kv_heads'): - if falcon_config.new_decoder_architecture or falcon_config.multi_query: - nemo_config.num_query_groups = falcon_config.num_kv_heads + + nemo_config.num_query_groups = falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None nemo_config.use_cpu_initialization = True nemo_config.activation = 'gelu' if falcon_config.rope_scaling is not None: @@ -183,7 +179,11 @@ def load_config(args): nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - + + nemo_config.mcore_gpt = True + nemo_config.transformer_engine = True + nemo_config.bias_activation_fusion = False + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -191,82 +191,64 @@ def load_config(args): return nemo_config +def determine_precision(args): + """Helper function to determine the precision of model + """ + if args.precision in ["32", "16"]: + return int(args.precision) + elif args.precision in ["bf16", "bf16-mixed"]: + if not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()): + logging.warning("BF16 is not supported on this device. Using FP16 instead.") + return args.precision[2:] # prune 'bf' from string + return args.precision + +def determine_dtype(precision): + dtype_map = { + "32": torch.float32, + "16": torch.float16, + "16-mixed": torch.float16, + "bf16": torch.bfloat16, + "bf16-mixed": torch.bfloat16 + } + return dtype_map.get(precision, torch.float32) # default to torch.float32 def convert(args): logging.info(f"loading checkpoint {args.in_file}") tik = time.time() model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) - #tokenizer = AutoTokenizer.from_pretrained(args.in_file) - hf_config = load_falcon_config(args) - - # print(f"hf_config: {hf_config}") - # print("named parameters:") - # for name, param in model.named_parameters(): - # print(f"- {name}") - - # add debug state dict list - hf_keys = list(model.state_dict().keys()) + falcon_config = load_falcon_config(args) + # debug + logging.debug(f"initial falcon_config, {falcon_config}") - nemo_config = load_config(args) - - if args.precision in ["32", "16"]: - precision = int(float(args.precision)) - elif args.precision in ["bf16", "bf16-mixed"]: - if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): - precision = args.precision - else: - logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") - precision = args.precision[2:] # prune bf in string - else: - precision = args.precision + nemo_config = load_nemo_config(args) + precision = determine_precision(args) plugins = [] - if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), - growth_interval=nemo_config.get('native_amp_growth_interval', 1000), - hysteresis=nemo_config.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if nemo_config.get('megatron_amp_O2', False): - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if precision == 32: - dtype = torch.float32 - elif precision in [16, "16", "16-mixed"]: - dtype = torch.float16 - elif precision == ["bf16", "bf16-mixed"]: - dtype = torch.bfloat16 - else: - dtype = torch.float32 # fallback + + if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: + scaler_params = { + 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), + 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), + 'hysteresis': nemo_config.get('hysteresis', 2) + } + + plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' + scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None + dtype = determine_dtype(precision) nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) - hidden_size = hf_config.hidden_size - head_num = hf_config.num_attention_heads + hidden_size = falcon_config.hidden_size + head_num = falcon_config.num_attention_heads head_size = hidden_size // head_num - num_layers = hf_config.num_hidden_layers + num_layers = falcon_config.num_hidden_layers - nemo_config.mcore_gpt = True - nemo_config.transformer_engine = True - nemo_config.bias_activation_fusion = False - - logging.info(f"nemo_config {nemo_config}") - logging.info(f"mcore_gpt: {nemo_config.mcore_gpt}") - logging.info(f"transformer_engine: {nemo_config.transformer_engine}") - # assert nemo_config.mcore_gpt == nemo_config.get( - # 'transformer_engine', False - # ), "mcore_gpt transformer_engine must be enabled (or disabled) together." 
+ # - MHA: num_heads = num_kv_heads + # - Multi-Query Attention: num_kv_heads = 1 + # - Grouped-Query Attention: num_heads % num_kv_heads = 0 + num_query_groups = nemo_config.num_query_groups if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num else head_num + assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' param_to_weights = lambda param: param.float() @@ -281,10 +263,6 @@ def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm else: target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - - # add debug remove mapped keys - if source_name in hf_keys: - hf_keys.remove(source_name) def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) @@ -293,34 +271,21 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') - if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: - num_query_groups = head_num - else: - num_query_groups = nemo_config.num_query_groups - assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' - for l in range(int(num_layers)): print(f"converting layer {l}") prefix = f'transformer.h.{l}' - - # HF: [num_heads x 3 x head_dim, hidden_size], interleaved qkv weights - # Query types and expected kv heads. - # - MHA: num_heads = num_kv_heads - # - Multi-Query Attention: num_kv_heads = 1 - # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') - if hf_config.new_decoder_architecture: + if falcon_config.new_decoder_architecture: add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) else: add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - if not hf_config.parallel_attn: + if not falcon_config.parallel_attn: add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) print(f"done layer {l}") @@ -331,10 +296,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals # LM weight add_to_checkpoint('lm_head', 'model.output_layer','weight') - if hf_keys: - logging.warning(f"Some keys in HuggingFace's model didn't get mapped to NeMo's state_dict: {hf_keys}") - checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + logging.debug(f'final checkpoint, {checkpoint}') tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) @@ -342,14 +305,17 @@ def 
add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() # cast to target precision and disable cpu init model = model.to(dtype=dtype) model.cfg.use_cpu_initialization = False - + # We make sure that the tokenizer can be instantiated later regardless of args.input + model.cfg.tokenizer.update(type=args.tokenizer_type) + # save model model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') From 8297b5ccb9b4c489e25316777cf8181f24712ebe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Sep 2023 17:55:43 +0000 Subject: [PATCH 04/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../convert_hf_falcon_to_nemo.py | 124 ++++++++++++------ 1 file changed, 82 insertions(+), 42 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 99e7866e43a2..2fcaae6a0f75 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -30,8 +30,8 @@ ``` """ -import os import logging +import os import time from argparse import ArgumentParser from collections import OrderedDict @@ -40,8 +40,7 @@ from omegaconf import OmegaConf from pytorch_lightning.core.saving import _load_state as ptl_load_state from pytorch_lightning.trainer.trainer import Trainer -from transformers import AutoTokenizer, AutoModelForCausalLM, FalconConfig - +from transformers import AutoModelForCausalLM, AutoTokenizer, FalconConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import ( @@ -51,22 +50,28 @@ PipelineMixedPrecisionPlugin, ) - # TODO: # [Y] refactor ckpt func to make it cleaner # [Y] dict tokenizer mapping for falcon family # [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml # [ ] safetensors loading. (only 180b used safetensors) -# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) +# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) # [Y] hf config name mapping for falcon 7b and 40b. # [Y] trust remote code add # [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) # [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error # [ ] remove unnecessary comments and codes. 
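+# Query-group convention assumed when setting num_query_groups
+# (a sketch that mirrors the MHA/MQA/GQA comments in convert()):
+#   MHA: num_kv_heads == num_attention_heads -> num_query_groups = head_num
+#   MQA: num_kv_heads == 1                   -> num_query_groups = 1
+#   GQA: head_num % num_kv_heads == 0        -> num_query_groups = num_kv_heads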
+ def setup_logging(log_file="test_log.txt"): - logging.basicConfig(filename=log_file, level=logging.DEBUG, - format='%(asctime)s [%(levelname)s] - %(message)s', datefmt='%d-%b-%y %H:%M:%S') + logging.basicConfig( + filename=log_file, + level=logging.DEBUG, + format='%(asctime)s [%(levelname)s] - %(message)s', + datefmt='%d-%b-%y %H:%M:%S', + ) + + def get_args(): parser = ArgumentParser() parser.add_argument( @@ -74,7 +79,11 @@ def get_args(): ) parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument("--precision", type=str, default="32", help="Model precision") - parser.add_argument("--tokenizer-type", type=str, default="tiiuae/falcon-7b", help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'." + parser.add_argument( + "--tokenizer-type", + type=str, + default="tiiuae/falcon-7b", + help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'.", ) args = parser.parse_args() return args @@ -115,14 +124,14 @@ def load_falcon_config(args) -> FalconConfig: "num_hidden_layers": config.n_layer, "num_attention_heads": config.n_head, "num_kv_heads": config.n_head_kv, - "new_decoder_architecture": True + "new_decoder_architecture": True, } elif config.model_type == 'RefinedWebModel': mappings = { "num_hidden_layers": config.n_layer, "num_attention_heads": config.n_head, - "num_kv_heads": 1 if config.multi_query else config.n_head, - "new_decoder_architecture": False + "num_kv_heads": 1 if config.multi_query else config.n_head, + "new_decoder_architecture": False, } else: return config @@ -147,10 +156,12 @@ def load_nemo_config(args): nemo_config.max_position_embeddings = falcon_config.max_position_embeddings nemo_config.init_method_std = falcon_config.initializer_range nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - try: + try: if falcon_config.alibi: - raise ValueError("Alibi is not yet supported in Megatron Core, \ - force to use RoPE will generate suboptimal responses") + raise ValueError( + "Alibi is not yet supported in Megatron Core, \ + force to use RoPE will generate suboptimal responses" + ) except ValueError as e: print(e) finally: @@ -163,15 +174,17 @@ def load_nemo_config(args): 'library': 'huggingface', 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
} - + nemo_config.tokenizer = tokenizer_dict ############################################## # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch - #nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn - #nemo_config.parallel_attention = falcon_config['parallel_attn'] + # nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn + # nemo_config.parallel_attention = falcon_config['parallel_attn'] ############################################### - nemo_config.num_query_groups = falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + nemo_config.num_query_groups = ( + falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + ) nemo_config.use_cpu_initialization = True nemo_config.activation = 'gelu' if falcon_config.rope_scaling is not None: @@ -179,11 +192,11 @@ def load_nemo_config(args): nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - + nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False - + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -191,6 +204,7 @@ def load_nemo_config(args): return nemo_config + def determine_precision(args): """Helper function to determine the precision of model """ @@ -202,16 +216,18 @@ def determine_precision(args): return args.precision[2:] # prune 'bf' from string return args.precision + def determine_dtype(precision): dtype_map = { "32": torch.float32, "16": torch.float16, "16-mixed": torch.float16, "bf16": torch.bfloat16, - "bf16-mixed": torch.bfloat16 + "bf16-mixed": torch.bfloat16, } return dtype_map.get(precision, torch.float32) # default to torch.float32 + def convert(args): logging.info(f"loading checkpoint {args.in_file}") tik = time.time() @@ -224,14 +240,14 @@ def convert(args): precision = determine_precision(args) plugins = [] - + if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: scaler_params = { 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), - 'hysteresis': nemo_config.get('hysteresis', 2) + 'hysteresis': nemo_config.get('hysteresis', 2), } - + plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None @@ -241,20 +257,26 @@ def convert(args): hidden_size = falcon_config.hidden_size head_num = falcon_config.num_attention_heads - head_size = hidden_size // head_num + head_size = hidden_size // head_num num_layers = falcon_config.num_hidden_layers - + # - MHA: num_heads = num_kv_heads # - Multi-Query Attention: num_kv_heads = 1 # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - num_query_groups = nemo_config.num_query_groups if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num else head_num - assert head_num % num_query_groups == 0, f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + num_query_groups = ( + nemo_config.num_query_groups + if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num + else head_num + ) + assert ( + head_num % num_query_groups == 0 + ), f'head_num ({head_num}) must be divisible by num_query_groups 
({num_query_groups})' param_to_weights = lambda param: param.float() checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): @@ -268,25 +290,43 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) if f"{source_prefix}.bias" in model.state_dict(): add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) - + add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') for l in range(int(num_layers)): print(f"converting layer {l}") prefix = f'transformer.h.{l}' - - add_weight_and_possible_bias(f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv') - add_weight_and_possible_bias(f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj') + + add_weight_and_possible_bias( + f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv' + ) + add_weight_and_possible_bias( + f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj' + ) add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.ln_attn', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) + add_weight_and_possible_bias( + f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True + ) else: - add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.input_layernorm', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) if not falcon_config.parallel_attn: - add_weight_and_possible_bias(f'{prefix}.post_attention_layernorm', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.post_attention_layernorm', + f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', + is_layernorm=True, + ) print(f"done layer {l}") @@ -294,18 +334,18 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') # LM weight - add_to_checkpoint('lm_head', 'model.output_layer','weight') - + add_to_checkpoint('lm_head', 'model.output_layer', 'weight') + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config logging.debug(f'final checkpoint, {checkpoint}') - + tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) logging.info(f'Weights loaded. 
Total time: {t}') del model - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -323,4 +363,4 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if __name__ == '__main__': setup_logging() args = get_args() - convert(args) \ No newline at end of file + convert(args) From 2fc07a40ae19627637248ff03c8d48bd31044f77 Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 20:05:42 +0000 Subject: [PATCH 05/69] fix for new architecture --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 99e7866e43a2..896310ecd3bd 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -183,6 +183,7 @@ def load_nemo_config(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False + nemo_config.bias_dropout_add_fusion = False base = 128 while falcon_config.vocab_size % base != 0: @@ -282,7 +283,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if falcon_config.new_decoder_architecture: add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) + add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) else: add_weight_and_possible_bias(f'{prefix}.input_layernorm', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) if not falcon_config.parallel_attn: @@ -306,7 +307,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) + model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() From 9bafd733698cd8e8ae59b4eecb24e54c0b99977a Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 20:13:40 +0000 Subject: [PATCH 06/69] new transformerlayer for falcon --- .../language_modeling/megatron/__init__.py | 1 + .../megatron/falcon_mcore/falcon_gpt_model.py | 309 ++++++++++++++++++ .../falcon_mcore/falcon_transformer_block.py | 287 ++++++++++++++++ .../falcon_mcore/falcon_transformer_config.py | 273 ++++++++++++++++ .../falcon_mcore/falcon_transformer_layer.py | 271 +++++++++++++++ 5 files changed, 1141 insertions(+) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py create mode 100644 
nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 3afb1e3fae48..0a7cab62e240 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -16,6 +16,7 @@ try: from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py new file mode 100644 index 000000000000..bcb904e7e4fc --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py @@ -0,0 +1,309 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# just copy paste here, need work +import logging +from typing import Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +#from megatron.core.transformer.transformer_block import TransformerBlock +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import FalconTransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class FalconGPTModel(MegatronModule): + """Transformer language model. + + Arguments: + config (TransformerConfig): transformer config + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. 
+ """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + super(GPTModel, self).__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. + if self.pre_process: + self.embedding = GPTEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + add_position_embedding=(self.position_embedding_type == 'learned_absolute'), + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + rotary_dim = self.config.kv_channels + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) + else: + self.rotary_pos_emb = None + + # Transformer. + self.decoder = FalconTransformerBlock( + config=self.config, + self_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + self.initialize_last_stage_with_word_embeddings() + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params=None, + ): + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. 
+ if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.rotary_pos_emb is not None: + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if self.decoder.input_tensor is not None: + rotary_seq_len = self.decoder.input_tensor.size(0) + else: + rotary_seq_len = decoder_input.size(0) + + # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region + if self.config.sequence_parallel: + rotary_seq_len *= self.config.tensor_model_parallel_size + + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def initialize_last_stage_with_word_embeddings(self): + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism and sharing word + # embeddings. Nothing to do if we aren't sharing weights or aren't using + # pipeline parallelism. + if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): + return + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. 
+ if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(GPTModel, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + GPTModel.embedding_warning_printed = True + + def sharded_state_dict(self, prefix=''): + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' + decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_key = f'{output_layer_prefix}weight' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py new file mode 100644 index 000000000000..30a80782b5ea --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py @@ -0,0 +1,287 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
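+"""Falcon variant of megatron-core's TransformerBlock, built from FalconTransformerLayer.
+
+Minimal usage sketch (an assumption for illustration only; it presumes megatron-core
+parallel state is already initialized and `cfg` is a fully populated TransformerConfig,
+and it is not exercised anywhere in this module):
+
+    block = FalconTransformerBlock(config=cfg, self_attn_mask_type=AttnMaskType.causal)
+    hidden_states = block(hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb)
+"""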
+ +import re +from contextlib import nullcontext + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +# change import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import FalconTransformerLayer +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + + +class FalconTransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.self_attn_mask_type = self_attn_mask_type + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + self.num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + self._build_layers() + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + layer = FalconTransformerLayer( + config=self.config, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + return layer + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = self.num_layers_per_pipeline_rank + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. 
This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): + """Forward method with activation checkpointing.""" + + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + + return custom_forward + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.config.recompute_num_layers), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + for l in range(self.num_layers_per_pipeline_rank): + if l < self.config.recompute_num_layers: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + else: + hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. 
+ # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=self.config.fp8_margin, + interval=self.config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=self.config.fp8_amax_compute_algo, + amax_history_len=self.config.fp8_amax_history_len, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group() + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. + if self.config.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + ) + else: + for layer in self.layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + ) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + for layer in self.layers: + sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) + + if self.post_process and self.post_layer_norm: + state_dict = self.state_dict(keep_vars=True) + + tensor = state_dict['final_layernorm.weight'] + layer_name = f'{prefix}final_layernorm.weight' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) + + # RMSNorm doesn't have bias. 
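+            # (Assumption noted for clarity: Falcon checkpoints use standard LayerNorm,
+            #  which does carry a bias, so the branch below is expected to fire for Falcon;
+            #  with normalization='RMSNorm' the key is simply absent.)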
+ if 'final_layernorm.bias' in state_dict.keys(): + tensor = state_dict['final_layernorm.bias'] + layer_name = f'{prefix}final_layernorm.bias' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( + tensor, layer_name + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py new file mode 100644 index 000000000000..cb980dad1b5f --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py @@ -0,0 +1,273 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# just copy paste here, need work +from dataclasses import dataclass +from typing import Callable + +import torch +import torch.nn.functional as F + +from megatron.core import ModelParallelConfig +from megatron.core.utils import init_method_normal, scaled_init_method_normal + + +@dataclass +class TransformerConfig(ModelParallelConfig): + """Configuration object for megatron-core transformers. + + Attributes: + + # model architecture + num_layers (int): Number of transformer layers in a transformer block. + hidden_size (int): Transformer hidden size. + ffn_hidden_size (int): Transformer Feed-Forward Network hidden size. + This is set to 4*hidden_size if not provided. Defaults to None.') + num_attention_heads (int): Number of transformer attention heads. + kv_channels (int): Projection weights dimension in multi-head attention. + This is set to hidden_size // num_attention_heads if not provided. + Defaults to None. + num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used. + + hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1. + attention_dropout (float): Post attention dropout probability. Defaults to 0.1. + fp32_residual_connection (bool): If true, move residual connections to fp32. + apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering. + Defaults to False. + layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5. + + layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values + around 0. This improves numerical stability. Defaults to False. + + add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two + in MLP layer). Default is True. + + gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False. + + activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu. + + # initialization + init_method (Callable): Method to initialize weights. Note that bias is always set to + zero. Should be a function that takes a single Tensor and + initializes it. Defaults to + megatron.core.utils.init_method_normal(init_method_std) which is + torch.nn.init.normal_ with mean=0.0 and std=init_method_Std. + + output_layer_init_method (Callable): Method to initialize weights of the output layer of + both attention and MLP blocks. Defaults to + megatron.core.utils.scaled_init_method_normal(init_method_std) + which is torch.nn.init.normal_ with mean=0.0 and + std=init_method_std / math.sqrt(2.0 * num_layers). 
+
+    init_method_std (float): Standard deviation of the zero mean normal for the default
+                             initialization method, not used if init_method and
+                             output_layer_init_method are provided. Defaults to 0.02.
+
+    # mixed-precision
+    apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True.
+    attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32.
+                                      This should be true if apply_query_key_layer_scaling is true.
+
+    # fusion
+    bias_gelu_fusion (bool): If true, fuses bias and gelu. Defaults to False.
+    masked_softmax_fusion (bool): If true, uses softmax fusion.
+    persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel.
+                               This kernel only supports a fixed set of hidden sizes.
+                               Defaults to False.
+    bias_dropout_fusion (bool): If true, uses bias dropout fusion.
+
+    # activation recomputation
+
+    recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory
+                                 intensive part of attention is checkpointed. These memory intensive activations
+                                 are also less compute intensive which makes activation checkpointing more efficient
+                                 for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer
+                                 Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint
+                                 the entire transformer layer. Must be 'selective' or 'full'. 'selective' always uses all layers.
+                                 Defaults to None.
+
+    recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer
+                            block and recompute the input activation of each divided chunk at the specified
+                            granularity. block will recompute the input activations for only a set number of
+                            transformer layers per pipeline stage. The rest of the layers in the pipeline stage
+                            will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to
+                            None.
+
+    recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer
+                                layers in each uniformly divided recompute unit. When recompute_method is block,
+                                recompute_num_layers is the number of transformer layers to recompute within each
+                                pipeline stage. Must be None for 'selective' activation checkpointing. Defaults to None.
+
+    distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel
+                                         group. Defaults to None.
+
+    # fp8 related (via Transformer Engine). For detailed info, refer to the Transformer Engine docs at
+    # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html
+
+    fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3'
+               uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and
+               e5m2 for all FP8 output activation gradient tensors. Defaults to None.
+
+    fp8_margin (int): Margin for the scaling factor computation.
+
+    fp8_interval (int): Controls how often the scaling factor is recomputed.
+
+    fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation.
+
+    fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation.
+                                 There are 2 predefined choices: `max` chooses the largest `amax` in the history
+                                 window, while `most_recent` always chooses the most recently seen value.
+ + fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision. + Defaults to True. + + # Experimental + normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily + used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`. + + + """ + + # model architecture + num_layers: int = 0 + hidden_size: int = 0 + num_attention_heads: int = 0 + num_query_groups: int = None + + ffn_hidden_size: int = None + kv_channels: int = None + hidden_dropout: float = 0.1 + attention_dropout: float = 0.1 + fp32_residual_connection: bool = False + # @jcasper should we keep this option? + apply_residual_connection_post_layernorm: bool = False + layernorm_epsilon: float = 1e-5 + layernorm_zero_centered_gamma: bool = False + add_bias_linear: bool = True + gated_linear_unit: bool = False + activation_func: Callable = F.gelu + + # initialization + init_method: Callable = None + output_layer_init_method: Callable = None + init_method_std: float = 0.02 + + # mixed-precision + apply_query_key_layer_scaling: bool = True + attention_softmax_in_fp32: bool = True + + # communication + + # fusion + bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ? + masked_softmax_fusion: bool = False + persist_layer_norm: bool = False + bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? + + # activation recomputation + recompute_granularity: str = None + recompute_method: str = None + recompute_num_layers: int = None + distribute_saved_activations: bool = None + + # fp8 related + fp8: str = None + fp8_margin: int = 0 + fp8_interval: int = 1 + fp8_amax_history_len: int = 1 + fp8_amax_compute_algo: str = "most_recent" + fp8_wgrad: bool = True + + # experimental section (TODO: move to apt. section above once stable) + normalization: bool = "LayerNorm" # alt value supported by TE: "RMSNorm" + + def __post_init__(self): + """ Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + super().__post_init__() + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + if self.num_attention_heads % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.num_query_groups is None: + self.num_query_groups = self.num_attention_heads + + if self.num_query_groups % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_query_groups ({self.num_query_groups}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.recompute_granularity is not None: + if not self.recompute_granularity in ['full', 'selective']: + raise ValueError( + f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' 
+ ) + + if self.recompute_method is not None: + if not self.recompute_method in ['block', 'uniform']: + raise ValueError( + f'recompute_method: {self.recompute_method} must be "block" or "uniform".' + ) + elif self.recompute_granularity != 'selective': + raise ValueError( + f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' + ) + + if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' + f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' + ) + elif ( + self.recompute_granularity == 'selective' and self.recompute_num_layers is not None + ): + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' + ) + + if self.distribute_saved_activations and self.sequence_parallel: + raise ValueError( + f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' + ) + + if self.virtual_pipeline_model_parallel_size is not None: + if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: + raise ValueError( + f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.bias_gelu_fusion: + if not self.add_bias_linear: + raise ValueError( + "When bias_gelu_fusion is True, add_bias_linear must also be True." + ) + + if self.activation_func != F.gelu: + raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py new file mode 100644 index 000000000000..9bd40a84376f --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py @@ -0,0 +1,271 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +#from megatron.core.transformer.attention import SelfAttention +# change attention due to extra layernorm before mlp, ln_mlp. 
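+# (Assumption, based on the comment above and on how the conversion script maps HF
+#  Falcon's ln_attn/ln_mlp weights: the Falcon-specific SelfAttention imported below
+#  exists to accommodate the extra pre-MLP layernorm used by the new decoder
+#  architecture.)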
+from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class FalconTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Args: + new_decoder_architecture (bool): + Whether to use Falcon's new decoder architecture that were used in 7B/40B/180B variants. + + parallel_attention (bool): + Whether to use parallel attention, which computes attention in parallel with feed forward layer. + + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int = 1, + self_attn_mask_type=AttnMaskType.padding, + parallel_attention=False, + new_decoder_architecture=False, + ): + super().__init__(config=config) + self.config: TransformerConfig = config + + self.layer_number = layer_number + self._get_layer_offset() + + self.self_attn_mask_type = self_attn_mask_type + + self.new_decoder_architecture = new_decoder_architecture + self.parallel_attention = parallel_attention + + # Layernorm on the input data. + # TODO: add pytorch only layernorm + self.input_layernorm = self._create_identity_op() + + self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None + + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + # Layernorm on the attention output + self.post_self_attn_layernorm = self._create_identity_op() + + # Self attention. + self.self_attention = SelfAttention( + config=self.config, layer_number=layer_number, attn_mask_type=self_attn_mask_type, + ) + + # MLP + self.mlp = MLP(config=self.config) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. 
+ # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _create_identity_op(self): + """Helper function to create an IdentityOp with common parameters.""" + return IdentityOp( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + input_mlp_ln = layernorm_output + + # Self attention. + attention_output_with_bias = self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # Residual connection. + if self.config.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # falcon specific + if self.new_decoder_architecture: + mlp_ln_output = self.mlp_layernorm(hidden_states) + + bias_dropout_add_func = get_bias_dropout_add(self.training, self.config.bias_dropout_fusion) + + # bias_dropout_add fusion returning fp32 instead of bf16 + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output_with_bias, residual, self.config.hidden_dropout + ) + + # falcon specific + if not self.new_decoder_architecture: + if self.parallel_attention: + layernorm_output = input_mlp_ln + else: + layernorm_output = self.post_self_attn_layernorm(layernorm_input) + residual = layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + else: + layernorm_output = mlp_ln_output + + # MLP. 
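+        # Dataflow summary of the branches above (descriptive only, no extra logic):
+        #   sequential (no parallel attn): x -> input_layernorm -> attn -> add residual x
+        #                                    -> post_self_attn_layernorm -> mlp -> add residual
+        #   parallel_attention:            mlp reuses input_layernorm(x); its output is summed
+        #                                  with the attention output below, then added to x
+        #   new_decoder_architecture:      mlp uses its own mlp_layernorm(x); its output is
+        #                                  summed with the attention output below, then added to x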
+ mlp_output_with_bias = self.mlp(layernorm_output) + + # falcon specific: + if self.new_decoder_architecture or self.parallel_attention: + mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output_with_bias, residual, self.config.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True + ) + + return output + + def sharded_state_dict(self, prefix=''): + + # state_dict = self.state_dict(prefix=prefix, keep_vars=True) + state_dict = self.state_dict(keep_vars=True) + + tensor_parallel_layers_axis_map = { + 'self_attention.linear_qkv.weight': 0, + 'self_attention.linear_qkv.bias': 0, + 'self_attention.linear_proj.weight': 1, + 'mlp.linear_fc1.weight': 0, + 'mlp.linear_fc1.bias': 0, + 'mlp.linear_fc2.weight': 1, + } + + offset = self._get_layer_offset() + num_layers = self.config.num_layers + + sharded_state_dict = {} + + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 + layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock + sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding + + if layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + # TP sharding + sharded_offsets.append( + [ + tp_axis + 1, # +1 for PP dimension + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ] + ) + replica_id = parallel_state.get_data_parallel_rank() + else: + replica_id = ( + parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank() + ) + + if layer_name.endswith('._extra_state'): + sharded_state_dict[layer_key] = ShardedObject( + f'{prefix}{layer_name}', + tensor, + (num_layers,), + (global_layer_offset,), + replica_id, + ) + + else: + sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( + f'{prefix}{layer_name}', + tensor, + *sharded_offsets, + replica_id=replica_id, + prepend_axis_num=1, # for PP sharding + ) + + return sharded_state_dict From 36fe312249bb2938df6013b2743b8997c1bc1ab8 Mon Sep 17 00:00:00 2001 From: vivian Date: Tue, 19 Sep 2023 21:06:10 +0000 Subject: [PATCH 07/69] fix for new decoder architecture --- .../convert_hf_falcon_to_nemo.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index b4f98d639a5e..aff5901b3f88 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -196,12 +196,8 @@ def load_nemo_config(args): nemo_config.mcore_gpt = True nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False -<<<<<<< HEAD nemo_config.bias_dropout_add_fusion = False -======= - ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe base = 128 while falcon_config.vocab_size % base != 0: base //= 2 
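+    # The loop above finds the largest power of two (capped at 128) that divides the HF
+    # vocab size, presumably so NeMo's make_vocab_size_divisible_by setting does not pad
+    # the embedding table. Worked example (assuming tiiuae/falcon-7b's vocab_size of
+    # 65024): 65024 = 128 * 508, so base remains 128.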
@@ -312,19 +308,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: -<<<<<<< HEAD add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) -======= - add_weight_and_possible_bias( - f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, - ) - add_weight_and_possible_bias( - f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True - ) ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', @@ -355,13 +340,8 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model -<<<<<<< HEAD #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) -======= - # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], strict=False, trainer=trainer) ->>>>>>> 8297b5ccb9b4c489e25316777cf8181f24712ebe model._save_restore_connector = NLPSaveRestoreConnector() From 044026d809c7fe23931df63b3a08f6de292e2c61 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 21:07:50 +0000 Subject: [PATCH 08/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon_mcore/falcon_gpt_model.py | 29 ++++----- .../falcon_mcore/falcon_transformer_block.py | 24 +++----- .../falcon_mcore/falcon_transformer_config.py | 20 ++---- .../falcon_mcore/falcon_transformer_layer.py | 61 ++++++++----------- .../convert_hf_falcon_to_nemo.py | 14 +++-- 6 files changed, 61 insertions(+), 89 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 0a7cab62e240..5dec9388528e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py index bcb904e7e4fc..b76b3d8828ee 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py @@ -4,17 +4,19 @@ from typing import Literal, Optional 
import torch -from torch import Tensor - from megatron.core import parallel_state, tensor_parallel from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding from megatron.core.models.gpt.gpt_embedding import GPTEmbedding from megatron.core.transformer.enums import AttnMaskType, ModelType from megatron.core.transformer.module import MegatronModule -#from megatron.core.transformer.transformer_block import TransformerBlock -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import FalconTransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint +from torch import Tensor + +# from megatron.core.transformer.transformer_block import TransformerBlock +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import ( + FalconTransformerBlock, +) class FalconGPTModel(MegatronModule): @@ -112,8 +114,7 @@ def __init__( bias=False, skip_bias_add=False, gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process - and self.share_embeddings_and_output_weights, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): @@ -239,9 +240,7 @@ def initialize_last_stage_with_word_embeddings(self): if torch.distributed.is_initialized(): if parallel_state.is_rank_in_embedding_group(): weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce( - weight.data, group=parallel_state.get_embedding_group() - ) + torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) elif not getattr(GPTModel, "embedding_warning_printed", False): logging.getLogger(__name__).warning( @@ -258,9 +257,7 @@ def sharded_state_dict(self, prefix=''): if self.pre_process: embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict( - prefix=embedding_prefix - ) + embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) sharded_state_dict.update(embedding_sharded_state_dict) decoder_prefix = f'{prefix}decoder.' 
@@ -278,9 +275,7 @@ def sharded_state_dict(self, prefix=''): first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' dp_rank = parallel_state.get_data_parallel_rank() dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = ( - dp_rank + dp_size - ) # copy of first stage embedding + last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( tensor=tensor, @@ -292,9 +287,7 @@ def sharded_state_dict(self, prefix=''): sharded_state_dict[output_layer_key] = sharded_output_layer_tensor else: - output_layer_state_dict = self.output_layer.state_dict( - prefix=output_layer_prefix, keep_vars=True - ) + output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) output_layer_tensor = output_layer_state_dict[output_layer_key] # independent output layer sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py index 30a80782b5ea..d3b9ae63e4ac 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py @@ -4,17 +4,19 @@ from contextlib import nullcontext import torch - from megatron.core import parallel_state, tensor_parallel from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.transformer.custom_layers.transformer_engine import TENorm from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig -# change import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import FalconTransformerLayer from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor +# change import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import ( + FalconTransformerLayer, +) + class FalconTransformerBlock(MegatronModule): """Transformer class.""" @@ -56,9 +58,7 @@ def _build_layers(self): # self.norm_factor *= coeff def build_layer(layer_number): layer = FalconTransformerLayer( - config=self.config, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, + config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, ) return layer @@ -202,9 +202,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # likely redundant, since p2p_communication.py (likely originator) # already creates viewless tensors. That said, make_viewless_tensor() # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor( - inp=hidden_states, requires_grad=True, keep_graph=True, - ) + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() @@ -242,9 +240,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # Forward pass. 
if self.config.recompute_granularity == 'full': hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, + hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, ) else: for layer in self.layers: @@ -280,8 +276,6 @@ def sharded_state_dict(self, prefix=''): if 'final_layernorm.bias' in state_dict.keys(): tensor = state_dict['final_layernorm.bias'] layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( - tensor, layer_name - ) + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py index cb980dad1b5f..e804a6228e70 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py @@ -185,9 +185,7 @@ def __post_init__(self): """ super().__post_init__() if self.fp16 and self.bf16: - raise ValueError( - f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' - ) + raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.') if self.num_attention_heads % self.tensor_model_parallel_size != 0: raise ValueError( @@ -221,9 +219,7 @@ def __post_init__(self): if self.recompute_method is not None: if not self.recompute_method in ['block', 'uniform']: - raise ValueError( - f'recompute_method: {self.recompute_method} must be "block" or "uniform".' - ) + raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".') elif self.recompute_granularity != 'selective': raise ValueError( f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' @@ -234,9 +230,7 @@ def __post_init__(self): f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' ) - elif ( - self.recompute_granularity == 'selective' and self.recompute_num_layers is not None - ): + elif self.recompute_granularity == 'selective' and self.recompute_num_layers is not None: raise ValueError( f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' ) @@ -257,9 +251,7 @@ def __post_init__(self): if self.bias_gelu_fusion: if not self.add_bias_linear: - raise ValueError( - "When bias_gelu_fusion is True, add_bias_linear must also be True." 
- ) + raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.") if self.activation_func != F.gelu: raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') @@ -268,6 +260,4 @@ def __post_init__(self): self.init_method = init_method_normal(self.init_method_std) if self.output_layer_init_method is None: - self.output_layer_init_method = scaled_init_method_normal( - self.init_method_std, self.num_layers - ) + self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py index 9bd40a84376f..e592a1e0b1ac 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py @@ -3,13 +3,9 @@ import re import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -#from megatron.core.transformer.attention import SelfAttention -# change attention due to extra layernorm before mlp, ln_mlp. -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention from megatron.core.transformer.custom_layers.transformer_engine import TENorm from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp @@ -18,6 +14,10 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor +# from megatron.core.transformer.attention import SelfAttention +# change attention due to extra layernorm before mlp, ln_mlp. +from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention + """ We use the following notation throughout this file: h: hidden size n: number of attention heads @@ -33,6 +33,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(MegatronModule): """A single transformer layer. @@ -62,16 +63,16 @@ def __init__( self.layer_number = layer_number + self._get_layer_offset() self.self_attn_mask_type = self_attn_mask_type - + self.new_decoder_architecture = new_decoder_architecture self.parallel_attention = parallel_attention - + # Layernorm on the input data. # TODO: add pytorch only layernorm self.input_layernorm = self._create_identity_op() - + self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None - + if self.new_decoder_architecture or self.parallel_attention: self.post_self_attn_layernorm = None else: @@ -93,7 +94,7 @@ def __init__( # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad - + def _create_identity_op(self): """Helper function to create an IdentityOp with common parameters.""" return IdentityOp( @@ -142,26 +143,23 @@ def forward( rotary_pos_emb=None, ): # hidden_states: [s, b, h] - + # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) input_mlp_ln = layernorm_output - + # Self attention. 
attention_output_with_bias = self.self_attention( - layernorm_output, - attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, + layernorm_output, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, ) - + # Residual connection. if self.config.apply_residual_connection_post_layernorm: residual = layernorm_output else: residual = hidden_states - - # falcon specific + + # falcon specific if self.new_decoder_architecture: mlp_ln_output = self.mlp_layernorm(hidden_states) @@ -169,9 +167,7 @@ def forward( # bias_dropout_add fusion returning fp32 instead of bf16 with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func( - attention_output_with_bias, residual, self.config.hidden_dropout - ) + layernorm_input = bias_dropout_add_func(attention_output_with_bias, residual, self.config.hidden_dropout) # falcon specific if not self.new_decoder_architecture: @@ -179,7 +175,9 @@ def forward( layernorm_output = input_mlp_ln else: layernorm_output = self.post_self_attn_layernorm(layernorm_input) - residual = layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + residual = ( + layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output + ) else: layernorm_output = mlp_ln_output @@ -189,11 +187,9 @@ def forward( # falcon specific: if self.new_decoder_architecture or self.parallel_attention: mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias - + with self.bias_dropout_add_exec_handler(): - output = bias_dropout_add_func( - mlp_output_with_bias, residual, self.config.hidden_dropout - ) + output = bias_dropout_add_func(mlp_output_with_bias, residual, self.config.hidden_dropout) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, @@ -201,9 +197,7 @@ def forward( # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = make_viewless_tensor( - inp=output, requires_grad=output.requires_grad, keep_graph=True - ) + output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) return output @@ -245,18 +239,13 @@ def sharded_state_dict(self, prefix=''): replica_id = parallel_state.get_data_parallel_rank() else: replica_id = ( - parallel_state.get_data_parallel_rank() - * parallel_state.get_data_parallel_world_size() + parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() + parallel_state.get_tensor_model_parallel_rank() ) if layer_name.endswith('._extra_state'): sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', - tensor, - (num_layers,), - (global_layer_offset,), - replica_id, + f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, ) else: diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index aff5901b3f88..d3de7e128782 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -197,7 +197,7 @@ def load_nemo_config(args): nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False nemo_config.bias_dropout_add_fusion = False - + base = 128 while falcon_config.vocab_size % base != 0: base //= 2 @@ -308,8 +308,14 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias(f'{prefix}.ln_attn', f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', is_layernorm=True) - add_weight_and_possible_bias(f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True) + add_weight_and_possible_bias( + f'{prefix}.ln_attn', + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', + is_layernorm=True, + ) + add_weight_and_possible_bias( + f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True + ) else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', @@ -340,7 +346,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() From 908004e0d81e47ceabe2e7319bbf299167e78407 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 21 Sep 2023 21:59:24 +0000 Subject: [PATCH 09/69] add DDP --- .../convert_hf_falcon_to_nemo.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index d3de7e128782..6dfada1c3580 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -46,6 +46,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -60,6 +61,7 @@ # [Y] trust remote code add # [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not 
sure about MQA) # [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error +# [Y] update save_to and restore_from for dist checkpointing # [ ] remove unnecessary comments and codes. @@ -178,8 +180,8 @@ def load_nemo_config(args): nemo_config.tokenizer = tokenizer_dict ############################################## # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch - # nemo_config.new_decoder_architecture = falcon_config['new_decoder_architecture'] #bool, if True, always use parallel attn - # nemo_config.parallel_attention = falcon_config['parallel_attn'] + nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn + nemo_config.parallel_attention = falcon_config.parallel_attn ############################################### nemo_config.num_query_groups = ( @@ -254,8 +256,8 @@ def convert(args): dtype = determine_dtype(precision) nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision) - + trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + hidden_size = falcon_config.hidden_size head_num = falcon_config.num_attention_heads head_size = hidden_size // head_num @@ -346,7 +348,6 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model - # model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -362,6 +363,6 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if __name__ == '__main__': - setup_logging() + #setup_logging() args = get_args() convert(args) From c69d577c46ec6004435234398bc2f97ca3667db1 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sat, 23 Sep 2023 02:25:16 +0000 Subject: [PATCH 10/69] fix state dict based on spec system --- .../convert_hf_falcon_to_nemo.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 6dfada1c3580..c7c828e99f37 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -65,7 +65,7 @@ # [ ] remove unnecessary comments and codes. 
-def setup_logging(log_file="test_log.txt"): +def setup_logging(log_file="test.txt"): logging.basicConfig( filename=log_file, level=logging.DEBUG, @@ -93,11 +93,13 @@ def get_args(): def load_model(cls, checkpoint, strict, **kwargs): try: + logging.debug(f'kwargs are, {kwargs}') if 'cfg' in kwargs: model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) else: model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) for name, module in model.named_parameters(): + logging.debug(f'model state dict name, {name}') if name in checkpoint['state_dict']: module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) @@ -199,6 +201,7 @@ def load_nemo_config(args): nemo_config.transformer_engine = True nemo_config.bias_activation_fusion = False nemo_config.bias_dropout_add_fusion = False + nemo_config.share_embeddings_and_output_weights = False base = 128 while falcon_config.vocab_size % base != 0: @@ -240,6 +243,8 @@ def convert(args): logging.debug(f"initial falcon_config, {falcon_config}") nemo_config = load_nemo_config(args) + # debug + logging.debug(f"initial nemo_config, {nemo_config}") precision = determine_precision(args) plugins = [] @@ -280,19 +285,16 @@ def convert(args): checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias, is_layernorm=False): + def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): source_name = f"{source_prefix}.{weight_or_bias}" if source_name in model.state_dict(): - if is_layernorm: - target_name = f"{target_prefix}_{weight_or_bias}" - else: - target_name = f"{target_prefix}.{weight_or_bias}" + target_name = f"{target_prefix}.{weight_or_bias}" checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=False): - add_to_checkpoint(source_prefix, target_prefix, 'weight', is_layernorm) + def add_weight_and_possible_bias(source_prefix, target_prefix): + add_to_checkpoint(source_prefix, target_prefix, 'weight') if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias', is_layernorm) + add_to_checkpoint(source_prefix, target_prefix, 'bias') add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') @@ -312,23 +314,21 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals if falcon_config.new_decoder_architecture: add_weight_and_possible_bias( f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.input_layernorm', ) add_weight_and_possible_bias( - f'{prefix}.ln_mlp', f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', is_layernorm=True + f'{prefix}.ln_mlp', + f'model.decoder.layers.{l}.pre_mlp_layernorm', ) else: add_weight_and_possible_bias( f'{prefix}.input_layernorm', - f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.input_layernorm', ) if not falcon_config.parallel_attn: add_weight_and_possible_bias( f'{prefix}.post_attention_layernorm', - f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm', - is_layernorm=True, + f'model.decoder.layers.{l}.post_self_attn_layernorm', ) print(f"done layer {l}") @@ -348,6 +348,7 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals del model + #model = load_model(MegatronGPTModel, checkpoint, 
strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() @@ -358,11 +359,12 @@ def add_weight_and_possible_bias(source_prefix, target_prefix, is_layernorm=Fals # We make sure that the tokenizer can be instantiated later regardless of args.input model.cfg.tokenizer.update(type=args.tokenizer_type) # save model + model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') if __name__ == '__main__': - #setup_logging() + setup_logging() args = get_args() convert(args) From 0610e19fae1672b950f3501baf5d44eebba4ecc6 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 21:00:33 +0000 Subject: [PATCH 11/69] fix state dict based on change in layers, fix amp O2 --- .../convert_hf_falcon_to_nemo.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index c7c828e99f37..f7995d471e24 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -65,7 +65,7 @@ # [ ] remove unnecessary comments and codes. -def setup_logging(log_file="test.txt"): +def setup_logging(log_file="test.log"): logging.basicConfig( filename=log_file, level=logging.DEBUG, @@ -93,22 +93,35 @@ def get_args(): def load_model(cls, checkpoint, strict, **kwargs): try: - logging.debug(f'kwargs are, {kwargs}') if 'cfg' in kwargs: model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) else: model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) for name, module in model.named_parameters(): - logging.debug(f'model state dict name, {name}') if name in checkpoint['state_dict']: module.data = checkpoint['state_dict'][name] checkpoint['state_dict'].pop(name) else: - logging.info(f"Unexpected key: {name} not in checkpoint but in model.") + print(f"Unexpected key: {name} not in checkpoint but in model.") + + for name, buffer in model.named_buffers(): + if name in checkpoint['state_dict']: + buffer.data = checkpoint['state_dict'][name] + checkpoint['state_dict'].pop(name) + if len(checkpoint['state_dict'].keys()) != 0: raise RuntimeError( f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." 
) + + # register the artifacts + cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] + if cfg.tokenizer.model is not None: + model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) + if cfg.tokenizer.vocab_file is not None: + model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) + if cfg.tokenizer.merge_file is not None: + model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) finally: cls._set_model_restore_state(is_being_restored=False) return model @@ -180,11 +193,9 @@ def load_nemo_config(args): } nemo_config.tokenizer = tokenizer_dict - ############################################## - # TODO: need refactor Mcore to support parallel attn and 40b/180b model arch + nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn nemo_config.parallel_attention = falcon_config.parallel_attn - ############################################### nemo_config.num_query_groups = ( falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None @@ -340,13 +351,15 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): add_to_checkpoint('lm_head', 'model.output_layer', 'weight') checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - logging.debug(f'final checkpoint, {checkpoint}') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'Weights loaded. Total time: {t}') + #logging.debug(f'final checkpoint, {checkpoint}') del model + + # state dict name for megatron_amp_O2 is different + if nemo_config.get('megatron_amp_O2', False): + keys = list(checkpoint['state_dict'].keys()) + for key in keys: + checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) @@ -362,6 +375,10 @@ def add_weight_and_possible_bias(source_prefix, target_prefix): model.save_to(args.out_file) logging.info(f'NeMo model saved to: {args.out_file}') + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logging.info(f'Weights loaded and saved. 
Total time: {t}') if __name__ == '__main__': From a8684d04a7664fc406c9685f3f6acc8309d6825b Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 22:29:41 +0000 Subject: [PATCH 12/69] add falcon spec system support --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/spec_falcon/__init__.py | 1 + .../megatron/spec_falcon/falcon_gpt_model.py | 313 ++++++++++++++++++ .../megatron/spec_falcon/falcon_spec.py | 43 +++ .../spec_falcon/spec_falcon_decoder_block.py | 290 ++++++++++++++++ .../spec_falcon/spec_falcon_decoder_layer.py | 299 +++++++++++++++++ .../language_modeling/megatron_gpt_model.py | 68 +++- 7 files changed, 1010 insertions(+), 6 deletions(-) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 5dec9388528e..b024f0b061c3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,7 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py new file mode 100644 index 000000000000..46da18a40345 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py @@ -0,0 +1 @@ +from .falcon_gpt_model import FalconGPTModel \ No newline at end of file diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py new file mode 100644 index 000000000000..67b93283b0a4 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py @@ -0,0 +1,313 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
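Before the new `falcon_gpt_model.py` below, one detail from patch 11 is worth spelling out: when `megatron_amp_O2` is enabled, the model is wrapped one module deeper, so every checkpoint key gains a `module.` segment. A minimal, dependency-free illustration of that renaming (toy keys, not a real checkpoint):

```
from collections import OrderedDict

# Toy stand-in for the converted checkpoint's state dict.
state_dict = OrderedDict(
    [
        ('model.embedding.word_embeddings.weight', 'emb'),
        ('model.decoder.layers.0.self_attention.linear_qkv.weight', 'qkv'),
        ('model.output_layer.weight', 'head'),
    ]
)

# Same renaming idea as in the patch: only the first 'model.' is replaced,
# so any later occurrence of the substring is left untouched.
for key in list(state_dict.keys()):
    state_dict[key.replace('model.', 'model.module.', 1)] = state_dict.pop(key)

print(list(state_dict.keys()))
# ['model.module.embedding.word_embeddings.weight',
#  'model.module.decoder.layers.0.self_attention.linear_qkv.weight',
#  'model.module.output_layer.weight']
```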
+# just copy paste here, need work +import logging +from typing import Literal, Optional + +import torch +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint +from torch import Tensor + +# from megatron.core.transformer.transformer_block import TransformerBlock +from .spec_falcon_decoder_block import FalconTransformerBlock + + +class FalconGPTModel(MegatronModule): + """Transformer language model. + + Arguments: + config (TransformerConfig): transformer config + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + super(FalconGPTModel, self).__init__(config=config) + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. 
+ if self.pre_process: + self.embedding = GPTEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + add_position_embedding=(self.position_embedding_type == 'learned_absolute'), + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + rotary_dim = self.config.kv_channels + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) + else: + self.rotary_pos_emb = None + + # Transformer. + self.decoder = FalconTransformerBlock( + config=self.config, + transformer_layer_spec=self.transformer_layer_spec, + self_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + self.initialize_last_stage_with_word_embeddings() + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params=None, + ): + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.rotary_pos_emb is not None: + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if self.decoder.input_tensor is not None: + rotary_seq_len = self.decoder.input_tensor.size(0) + else: + rotary_seq_len = decoder_input.size(0) + + # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region + if self.config.sequence_parallel: + rotary_seq_len *= self.config.tensor_model_parallel_size + + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. 
+ hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def initialize_last_stage_with_word_embeddings(self): + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism and sharing word + # embeddings. Nothing to do if we aren't sharing weights or aren't using + # pipeline parallelism. + if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): + return + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. + if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(GPTModel, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + GPTModel.embedding_warning_printed = True + + def sharded_state_dict(self, prefix=''): + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' 
+ decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_key = f'{output_layer_prefix}weight' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py new file mode 100644 index 000000000000..70c90e520937 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py @@ -0,0 +1,43 @@ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, + TENorm, + TEColumnParallelLinear, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules + +# Use this spec for an implementation using modules in TE +falcon_layer_spec = ModuleSpec( + module=FalconTransformerLayer, + submodules=FalconTransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + dot_product_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + post_self_attn_layernorm=TENorm, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, 
linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), +) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py new file mode 100644 index 000000000000..8a009c39b59e --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py @@ -0,0 +1,290 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re +from contextlib import nullcontext + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + + +class FalconTransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + + self.self_attn_mask_type = self_attn_mask_type + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + self.num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + self._build_layers(self.transformer_layer_spec) + + def _build_layers(self, transformer_layer_spec): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + layer = FalconTransformerLayer( + config=self.config, + submodules=transformer_layer_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + return layer + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. 
+ # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = self.num_layers_per_pipeline_rank + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): + """Forward method with activation checkpointing.""" + + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + + return custom_forward + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.config.recompute_num_layers), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. 
+ for l in range(self.num_layers_per_pipeline_rank): + if l < self.config.recompute_num_layers: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + else: + hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=self.config.fp8_margin, + interval=self.config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=self.config.fp8_amax_compute_algo, + amax_history_len=self.config.fp8_amax_history_len, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group() + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. 
+ if self.config.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + ) + else: + for layer in self.layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + ) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + for layer in self.layers: + sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) + + if self.post_process and self.post_layer_norm: + state_dict = self.state_dict(keep_vars=True) + + tensor = state_dict['final_layernorm.weight'] + layer_name = f'{prefix}final_layernorm.weight' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) + + # RMSNorm doesn't have bias. + if 'final_layernorm.bias' in state_dict.keys(): + tensor = state_dict['final_layernorm.bias'] + layer_name = f'{prefix}final_layernorm.bias' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( + tensor, layer_name + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py new file mode 100644 index 000000000000..68f417b04409 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py @@ -0,0 +1,299 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer.attention import SelfAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +@dataclass +class FalconTransformerLayerSubmodules: + input_layernorm: Union[ModuleSpec, type] = IdentityOp + self_attention: Union[ModuleSpec, type] = IdentityOp + self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp + + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + mlp: Union[ModuleSpec, type] = IdentityOp + mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp + + +class FalconTransformerLayer(MegatronModule): + """A single transformer layer. 
+ + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + """ + + def __init__( + self, + config: TransformerConfig, # should come from FalconTransformerConfig class + submodules: FalconTransformerLayerSubmodules, + layer_number: int = 1, + self_attn_mask_type=AttnMaskType.padding, + ): + super().__init__(config=config) + self.config: TransformerConfig = config + + + self.layer_number = layer_number + self._get_layer_offset() + + self.self_attn_mask_type = self_attn_mask_type + + self.new_decoder_architecture = self.config.new_decoder_architecture + + self.parallel_attention = self.config.parallel_attention + + + ## [Module 1: Input Layernorm] Optional Layernorm on the input data + # TODO: add pytorch only layernorm + self.input_layernorm = build_module( + submodules.input_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + ## [Module 2: SelfAttention] + self.self_attention = build_module( + submodules.self_attention, config=self.config, layer_number=layer_number, + ) + + ## [Module 3: BiasDropoutFusion] Optional + self.self_attn_bda = build_module(submodules.self_attn_bda) + + ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) if self.new_decoder_architecture else None + + ## [Module 6: MLP block] + self.mlp = build_module(submodules.mlp, config=self.config) + + ## [Module 7: BiasDropoutFusion] Optional + self.mlp_bda = build_module(submodules.mlp_bda) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. 
+ # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [s, b, h] + + # Residual connection. + residual = hidden_states + + if self.new_decoder_architecture: + mlp_ln_output = self.pre_mlp_layernorm(hidden_states) + + # Optional Input Layer norm + input_layernorm_output = self.input_layernorm(hidden_states) + + input_mlp_ln = input_layernorm_output + + # Self attention. + attention_output_with_bias = self.self_attention( + input_layernorm_output, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.config.hidden_dropout + ) + + if not self.new_decoder_architecture: + if self.parallel_attention: + layernorm_output = input_mlp_ln + else: + residual = hidden_states + layernorm_output = self.post_self_attn_layernorm(hidden_states) + + else: + layernorm_output = mlp_ln_output + + mlp_output_with_bias = self.mlp(layernorm_output) + + # falcon specific: + if self.new_decoder_architecture or self.parallel_attention: + mlp_output= mlp_output_with_bias[0] + attn_output= attention_output_with_bias[0] + mlp_output_without_bias = mlp_output + attn_output + mlp_output_with_bias = (mlp_output_without_bias, None) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( + mlp_output_with_bias, residual, self.config.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. 
+ output = make_viewless_tensor( + inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True + ) + + return output + + def sharded_state_dict(self, prefix=''): + + # state_dict = self.state_dict(prefix=prefix, keep_vars=True) + state_dict = self.state_dict(keep_vars=True) + + tensor_parallel_layers_axis_map = { + 'self_attention.linear_qkv.weight': 0, + 'self_attention.linear_qkv.bias': 0, + 'self_attention.linear_proj.weight': 1, + 'mlp.linear_fc1.weight': 0, + 'mlp.linear_fc1.bias': 0, + 'mlp.linear_fc2.weight': 1, + } + + offset = self._get_layer_offset() + num_layers = self.config.num_layers + + sharded_state_dict = {} + + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 + layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock + sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding + + if layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + # TP sharding + sharded_offsets.append( + [ + tp_axis + 1, # +1 for PP dimension + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ] + ) + replica_id = parallel_state.get_data_parallel_rank() + else: + replica_id = ( + parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank() + ) + + if layer_name.endswith('._extra_state'): + sharded_state_dict[layer_key] = ShardedObject( + f'{prefix}{layer_name}', + tensor, + (num_layers,), + (global_layer_offset,), + replica_id, + ) + + else: + sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( + f'{prefix}{layer_name}', + tensor, + *sharded_offsets, + replica_id=replica_id, + prepend_axis_num=1, # for PP sharding + ) + + return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 45586bffcdce..cf640a813dae 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -15,7 +15,7 @@ import itertools import queue import warnings -from dataclasses import fields +from dataclasses import fields, dataclass from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union @@ -78,6 +78,8 @@ from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal + from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec + from megatron.core.transformer.spec_utils import import_module # TODO @tmoon: Use once available in Megatron-LM # from megatron.core.pipeline_parallel.schedules import DataIteratorList @@ -99,6 +101,25 @@ except (ImportError, ModuleNotFoundError): HAVE_TE = False +def import_falcon_gpt_model(): + """Conditionally import FalconGPTModel. 
+ """ + try: + #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_spec import falcon_layer_spec + return FalconGPTModel, falcon_layer_spec + except (ImportError, ModuleNotFoundError): + raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") + +@dataclass +class FalconTransformerConfig(TransformerConfig): + """ + Transformer Config for Falcon Variants + """ + + new_decoder_architecture: bool = False + parallel_attention: bool = False class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ @@ -213,7 +234,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) self.mcore_gpt = cfg.get('mcore_gpt', False) - + # Falcon specific args + self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) + self.parallel_attention = cfg.get('parallel_attention', False) + self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: self.prev_consumed_samples = 0 @@ -301,9 +325,32 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt: + if self.mcore_gpt and (self.new_decoder_architecture or self.parallel_attention): + FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() + transformer_layer_spec = falcon_layer_spec + #debug + logging.info(f'falcon gpt config, {self.transformer_config}') + model = FalconGPTModel( + config=self.transformer_config, + transformer_layer_spec = transformer_layer_spec, + vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), + max_sequence_length=self.cfg.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) + + logging.info(f'model architecture is {model}') + + elif self.mcore_gpt: + transformer_layer_spec = gpt_layer_with_transformer_engine_spec model = MCoreGPTModel( config=self.transformer_config, + transformer_layer_spec = transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1482,6 +1529,10 @@ def build_transformer_config(self) -> TransformerConfig: gated_linear_unit = activation.endswith('glu') activation_func = activation_to_func(activation) + mcore_gpt = self.cfg.get('mcore_gpt', False) + new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) + parallel_attention = self.cfg.get('parallel_attention', False) + normalization = self.cfg.get('normalization', 'layernorm') if normalization == 'layernorm': normalization = 'LayerNorm' @@ -1563,8 +1614,15 @@ def build_transformer_config(self) -> TransformerConfig: f"The model: {self} does not have field.name: {field.name} in its cfg. " f"Add this key to cfg or config_mapping to make to make it configurable." 
) - - transformer_config = TransformerConfig(**transformer_config_dict) + + if mcore_gpt and (new_decoder_architecture or parallel_attention): + transformer_config = FalconTransformerConfig( + **transformer_config_dict, + new_decoder_architecture = new_decoder_architecture, + parallel_attention = parallel_attention, + ) + else: + transformer_config = TransformerConfig(**transformer_config_dict) return transformer_config From 09952722b393661f1ae29536c383bc66742b2ac6 Mon Sep 17 00:00:00 2001 From: Vivian Date: Sun, 24 Sep 2023 22:32:54 +0000 Subject: [PATCH 13/69] remove old falcon mcore support --- .../megatron/falcon_mcore/falcon_gpt_model.py | 302 ------------------ .../falcon_mcore/falcon_transformer_block.py | 281 ---------------- .../falcon_mcore/falcon_transformer_config.py | 263 --------------- .../falcon_mcore/falcon_transformer_layer.py | 260 --------------- 4 files changed, 1106 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py deleted file mode 100644 index b76b3d8828ee..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_gpt_model.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work -import logging -from typing import Literal, Optional - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding -from megatron.core.models.gpt.gpt_embedding import GPTEmbedding -from megatron.core.transformer.enums import AttnMaskType, ModelType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint -from torch import Tensor - -# from megatron.core.transformer.transformer_block import TransformerBlock -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_block import ( - FalconTransformerBlock, -) - - -class FalconGPTModel(MegatronModule): - """Transformer language model. - - Arguments: - config (TransformerConfig): transformer config - - vocab_size (int): vocabulary size - - max_sequence_length (int): maximum size of sequence. This is used for positional embedding - - pre_process (bool): Include embedding layer (used with pipeline parallelism) - post_process (bool): Include an output layer (used with pipeline parallelism) - - parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks - - share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are - shared. Defaults to False. - - position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. - Defaults is 'learned_absolute'. - - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - Defaults to 1.0 (100%). 
Ignored unless position_embedding_type is 'rope'. - - seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. - The value must be a float larger than 1.0. Defaults to None. - """ - - def __init__( - self, - config: TransformerConfig, - vocab_size: int, - max_sequence_length: int, - pre_process: bool = True, - post_process: bool = True, - fp16_lm_cross_entropy: bool = False, - parallel_output: bool = True, - share_embeddings_and_output_weights: bool = False, - position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', - rotary_percent: float = 1.0, - seq_len_interpolation_factor: Optional[float] = None, - ): - super(GPTModel, self).__init__(config=config) - - self.config: TransformerConfig = config - self.vocab_size = vocab_size - self.max_sequence_length = max_sequence_length - self.pre_process = pre_process - self.post_process = post_process - self.fp16_lm_cross_entropy = fp16_lm_cross_entropy - self.parallel_output = parallel_output - self.share_embeddings_and_output_weights = share_embeddings_and_output_weights - self.position_embedding_type = position_embedding_type - - # megatron core pipelining currently depends on model type - # TODO: remove this dependency ? - self.model_type = ModelType.encoder_or_decoder - - # Embeddings. - if self.pre_process: - self.embedding = GPTEmbedding( - config=self.config, - vocab_size=self.vocab_size, - max_sequence_length=self.max_sequence_length, - add_position_embedding=(self.position_embedding_type == 'learned_absolute'), - ) - - # Rotary Position Embeddings - if self.position_embedding_type == 'rope': - rotary_dim = self.config.kv_channels - if rotary_percent < 1.0: - rotary_dim = int(rotary_dim * rotary_percent) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) - else: - self.rotary_pos_emb = None - - # Transformer. - self.decoder = FalconTransformerBlock( - config=self.config, - self_attn_mask_type=AttnMaskType.causal, - pre_process=self.pre_process, - post_process=self.post_process, - ) - - # Output - if post_process: - self.output_layer = tensor_parallel.ColumnParallelLinear( - config.hidden_size, - self.vocab_size, - config=config, - init_method=config.init_method, - bias=False, - skip_bias_add=False, - gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, - ) - - if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): - self.initialize_last_stage_with_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" - - # This is usually handled in schedules.py but some inference code still - # gives us non-lists or None - if not isinstance(input_tensor, list): - input_tensor = [input_tensor] - - assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' - self.decoder.set_input_tensor(input_tensor[0]) - - def forward( - self, - input_ids: Tensor, - position_ids: Tensor, - attention_mask: Tensor, - decoder_input: Tensor = None, - labels: Tensor = None, - inference_params=None, - ): - # If decoder_input is provided (not None), then input_ids and position_ids are ignored. - # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. - - # Decoder embedding. 
- if decoder_input is not None: - pass - elif self.pre_process: - decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - else: - # intermediate stage of pipeline - # decoder will get hidden_states from encoder.input_tensor - decoder_input = None - - # Rotary positional embeddings - rotary_pos_emb = None - if self.rotary_pos_emb is not None: - if inference_params is not None: - rotary_seq_len = inference_params.max_sequence_length - else: - if self.decoder.input_tensor is not None: - rotary_seq_len = self.decoder.input_tensor.size(0) - else: - rotary_seq_len = decoder_input.size(0) - - # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region - if self.config.sequence_parallel: - rotary_seq_len *= self.config.tensor_model_parallel_size - - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) - - # Run decoder. - hidden_states = self.decoder( - hidden_states=decoder_input, - attention_mask=attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, - ) - - if not self.post_process: - return hidden_states - - # logits and loss - output_weight = None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) - - if labels is None: - # [s b h] => [b s h] - return logits.transpose(0, 1).contiguous() - - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - - # [s b] => [b, s] - loss = loss.transpose(0, 1).contiguous() - return loss - - def shared_embedding_or_output_weight(self): - if self.pre_process: - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - - def initialize_last_stage_with_word_embeddings(self): - - # This function just initializes the word embeddings in the final stage - # when we are using pipeline parallelism and sharing word - # embeddings. Nothing to do if we aren't sharing weights or aren't using - # pipeline parallelism. - if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): - return - - if self.post_process and not self.pre_process: - assert not parallel_state.is_pipeline_first_stage() - # set word_embeddings weights to 0 here, then copy first - # stage's weights using all_reduce below. - self.output_layer.weight.data.fill_(0) - self.output_layer.weight.shared = True - - # Parameters are shared between the word embeddings layers, and the - # heads at the end of the model. In a pipelined setup with more than - # one stage, the initial embedding layer and the head are on different - # workers, so we do the following: - # 1. Create a second copy of word_embeddings on the last stage, with - # initial parameters of 0.0. - # 2. Do an all-reduce between the first and last stage to ensure that - # the two copies of word_embeddings start off with the same - # parameter values. - # 3. In the training loop, before an all-reduce between the grads of - # the two word_embeddings layers to ensure that every applied weight - # update is the same on both stages. - - # Ensure that first and last stages have the same initial parameter - # values. 
- if torch.distributed.is_initialized(): - if parallel_state.is_rank_in_embedding_group(): - weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) - - elif not getattr(GPTModel, "embedding_warning_printed", False): - logging.getLogger(__name__).warning( - "Distributed processes aren't initialized, so the output layer " - "is not initialized with weights from the word embeddings. " - "If you are just manipulating a model this is fine, but " - "this needs to be handled manually. If you are training " - "something is definitely wrong." - ) - GPTModel.embedding_warning_printed = True - - def sharded_state_dict(self, prefix=''): - sharded_state_dict = {} - - if self.pre_process: - embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) - sharded_state_dict.update(embedding_sharded_state_dict) - - decoder_prefix = f'{prefix}decoder.' - decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) - sharded_state_dict.update(decoder_sharded_state_dict) - - if self.post_process: - output_layer_prefix = f'{prefix}output_layer.' - output_layer_key = f'{output_layer_prefix}weight' - if self.share_embeddings_and_output_weights: - if not self.pre_process: - # when sharing embeddings with last stage, we need to use the weights from the first stage - # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight - tensor = self.shared_embedding_or_output_weight() - first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' - dp_rank = parallel_state.get_data_parallel_rank() - dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding - - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=tensor, - key=first_stage_word_emb_key, - replica_id=last_stage_word_emb_replica_id, - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - else: - output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) - output_layer_tensor = output_layer_state_dict[output_layer_key] - # independent output layer - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=output_layer_tensor, - key=output_layer_key, - replica_id=parallel_state.get_data_parallel_rank(), - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py deleted file mode 100644 index d3b9ae63e4ac..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_block.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- -import re -from contextlib import nullcontext - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor - -# change import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_transformer_layer import ( - FalconTransformerLayer, -) - - -class FalconTransformerBlock(MegatronModule): - """Transformer class.""" - - def __init__( - self, - config: TransformerConfig, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - ): - super().__init__(config=config) - - self.config: TransformerConfig = config - - self.self_attn_mask_type = self_attn_mask_type - self.post_layer_norm = post_layer_norm - self.pre_process = pre_process - self.post_process = post_process - - # required for pipeline parallel schedules - self.input_tensor = None - - self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' - - self.num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - self._build_layers() - - def _build_layers(self): - # Transformer layers. - # @jcasper can we improve how we deal with layer_number? - # currently it's only used in CoreAttention? - # if self.apply_query_key_layer_scaling: - # coeff = self.layer_number - # self.norm_factor *= coeff - def build_layer(layer_number): - layer = FalconTransformerLayer( - config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, - ) - return layer - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - # Interleaved pipeline parallelism: - # Number of layers in each model chunk is the number of layers in the stage, - # divided by the number of model chunks in a stage. - # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0] [2] [4] [6] - # Stage 1: [1] [3] [5] [7] - # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0, 1] [4, 5] - # Stage 1: [2, 3] [6, 7] - - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size - - num_layers_to_build = num_layers_per_virtual_rank - - else: - # Non-interleaved pipeline parallelism: - # Each stage gets a contiguous set of layers. - - num_layers_to_build = self.num_layers_per_pipeline_rank - - # offset is implicit in TransformerLayer - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) - - # # TODO: add back standalone_embedding_stage - # if self.num_layers == 0: - # # When a standalone embedding stage is used (e.g., - # # args.standalone_embedding_stage == True), virtual pipeline ranks - # # on pipeline rank 0 will have zero transformer layers assigned to - # # them. 
This results in the model's input and output tensors to be - # # the same, which will cause failure for certain output tensor - # # optimizations (e.g., pipeline output deallocation). To remedy - # # this, we assign a 'no-op' layer on these ranks, which will - # # disconnect the input tensor from the output tensor. - # self.num_layers = 1 - # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) - # else: - # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) - - if self.post_process and self.post_layer_norm: - # Final layer norm before output. - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*args, **kwargs): - x_, *args = args - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, *args, **kwargs) - return x_ - - return custom_forward - - if self.config.recompute_method == 'uniform': - # Uniformly divide the total number of Transformer layers and checkpoint - # the input activation of each divided chunk. - # A method to further reduce memory usage reducing checkpoints. - l = 0 - while l < self.num_layers_per_pipeline_rank: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + self.config.recompute_num_layers), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - - l += self.config.recompute_num_layers - - elif self.config.recompute_method == 'block': - # Checkpoint the input activation of only a set number of individual - # Transformer layers and skip the rest. - # A method fully use the device memory removing redundant re-computation. - for l in range(self.num_layers_per_pipeline_rank): - if l < self.config.recompute_num_layers: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + 1), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - else: - hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) - else: - raise ValueError("Invalid activation recompute method.") - - return hidden_states - - def set_input_tensor(self, input_tensor): - """Set input tensor to be used instead of forward()'s input. - - When doing pipeline parallelism the input from the previous - stage comes from communication, not from the input, so the - model's forward_step_func won't have it. This function is thus - used by internal code to bypass the input provided by the - forward_step_func""" - self.input_tensor = input_tensor - - def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): - # hidden_states (float): [s, b, h] - # attention_mask (bool): [1, 1, s, s] - - if not self.pre_process: - # See set_input_tensor() - hidden_states = self.input_tensor - - # Viewless tensor. - # - We only need to create a viewless tensor in the case of micro batch - # size (mbs) == 1, since in this case, 'hidden_states.transpose()' - # above creates a view tensor, and '.contiguous()' is a pass-through. 
- # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating - # the need to make it viewless. - # - # However, we don't explicitly check mbs == 1 here because - # make_viewless_tensor() has negligible overhead when its input - # is already viewless. - # - # - For the 'else' case above, calling make_viewless_tensor() here is - # likely redundant, since p2p_communication.py (likely originator) - # already creates viewless tensors. That said, make_viewless_tensor() - # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) - - if self.config.sequence_parallel: - rng_context = tensor_parallel.get_cuda_rng_tracker().fork() - else: - rng_context = nullcontext() - - if self.config.fp8: - import transformer_engine # To keep out TE dependency when not training in fp8 - - if self.config.fp8 == "e4m3": - fp8_format = transformer_engine.common.recipe.Format.E4M3 - elif self.config.fp8 == "hybrid": - fp8_format = transformer_engine.common.recipe.Format.HYBRID - else: - raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") - - fp8_recipe = transformer_engine.common.recipe.DelayedScaling( - margin=self.config.fp8_margin, - interval=self.config.fp8_interval, - fp8_format=fp8_format, - amax_compute_algo=self.config.fp8_amax_compute_algo, - amax_history_len=self.config.fp8_amax_history_len, - override_linear_precision=(False, False, not self.config.fp8_wgrad), - ) - fp8_group = None - if parallel_state.model_parallel_is_initialized(): - fp8_group = parallel_state.get_amax_reduction_group() - fp8_context = transformer_engine.pytorch.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group - ) - else: - fp8_context = nullcontext() - - with rng_context and fp8_context: - # Forward pass. - if self.config.recompute_granularity == 'full': - hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, - ) - else: - for layer in self.layers: - hidden_states = layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - inference_params=inference_params, - ) - - # Final layer norm. - if self.post_process and self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - def sharded_state_dict(self, prefix=''): - - sharded_state_dict = {} - - layer_prefix = f'{prefix}layers.' - for layer in self.layers: - sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) - - if self.post_process and self.post_layer_norm: - state_dict = self.state_dict(keep_vars=True) - - tensor = state_dict['final_layernorm.weight'] - layer_name = f'{prefix}final_layernorm.weight' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - # RMSNorm doesn't have bias. 
- if 'final_layernorm.bias' in state_dict.keys(): - tensor = state_dict['final_layernorm.bias'] - layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py deleted file mode 100644 index e804a6228e70..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_config.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work -from dataclasses import dataclass -from typing import Callable - -import torch -import torch.nn.functional as F - -from megatron.core import ModelParallelConfig -from megatron.core.utils import init_method_normal, scaled_init_method_normal - - -@dataclass -class TransformerConfig(ModelParallelConfig): - """Configuration object for megatron-core transformers. - - Attributes: - - # model architecture - num_layers (int): Number of transformer layers in a transformer block. - hidden_size (int): Transformer hidden size. - ffn_hidden_size (int): Transformer Feed-Forward Network hidden size. - This is set to 4*hidden_size if not provided. Defaults to None.') - num_attention_heads (int): Number of transformer attention heads. - kv_channels (int): Projection weights dimension in multi-head attention. - This is set to hidden_size // num_attention_heads if not provided. - Defaults to None. - num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used. - - hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1. - attention_dropout (float): Post attention dropout probability. Defaults to 0.1. - fp32_residual_connection (bool): If true, move residual connections to fp32. - apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering. - Defaults to False. - layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5. - - layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values - around 0. This improves numerical stability. Defaults to False. - - add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two - in MLP layer). Default is True. - - gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False. - - activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu. - - # initialization - init_method (Callable): Method to initialize weights. Note that bias is always set to - zero. Should be a function that takes a single Tensor and - initializes it. Defaults to - megatron.core.utils.init_method_normal(init_method_std) which is - torch.nn.init.normal_ with mean=0.0 and std=init_method_Std. - - output_layer_init_method (Callable): Method to initialize weights of the output layer of - both attention and MLP blocks. Defaults to - megatron.core.utils.scaled_init_method_normal(init_method_std) - which is torch.nn.init.normal_ with mean=0.0 and - std=init_method_std / math.sqrt(2.0 * num_layers). 
- - init_method_std (float): Standard deviation of the zero mean normal for the default - initialization method, not used if init_method and - output_layer_init_method are provided. Defaults to 0.02. - - # mixed-precision - apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True. - attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32. - This should be true if apply_query_key_layer_scaling is true. - - # fusion - bias_gelu_fustion (bool): If true, fuses bias and gelu. Defaults to False. - masked_softmax_fusion (bool): If true, uses softmax fusion. - persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel. - This kernel only supports a fixed set of hidden sizes. - Defaults to False. - bias_dropout_fusion (bool): If true, uses bias dropout fusion. - - # activation recomputation - - recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory - intensive part of attention is checkpointed. These memory intensive activations - are also less compute intensive which makes activation checkpointing more efficient - for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer - Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint - the entire transformer layer. Must be 'selective' or 'full'. 'selective' always uses all layers. - Defaults to None. - - recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer - block and recompute the input activation of each divided chunk at the specified - granularity. block will recompute the input activations for only a set number of - transformer layers per pipeline stage. The rest of the layers in the pipeline stage - will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to - None. - - recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer - layers in each uniformly divided recompute unit. When recompute_method is block, - recompute_num_layers is the number of transformer layers to recompute within each - pipeline stage. Must be None for 'selective' activation checkpointing. Defaults to None. - - distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel - group. Defaults to None. - - # fp8 related (via Transformer Engine). For detailed info, refer the the Transformer Engine docs at - # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html - - fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3' - uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and - e5m2 for all FP8 output activation gradient tensors. Defaults to None. - - fp8_margin (int): Margin for the scaling factor computation. - - fp8_interval (int): Controls how often the scaling factor is recomputed. - - fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation. - - fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation. - There are 2 predefined choices: `max` chooses the largest `amax` in the history - window, while `most_recent` always chooses the most recently seen value. 
- - fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision. - Defaults to True. - - # Experimental - normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily - used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`. - - - """ - - # model architecture - num_layers: int = 0 - hidden_size: int = 0 - num_attention_heads: int = 0 - num_query_groups: int = None - - ffn_hidden_size: int = None - kv_channels: int = None - hidden_dropout: float = 0.1 - attention_dropout: float = 0.1 - fp32_residual_connection: bool = False - # @jcasper should we keep this option? - apply_residual_connection_post_layernorm: bool = False - layernorm_epsilon: float = 1e-5 - layernorm_zero_centered_gamma: bool = False - add_bias_linear: bool = True - gated_linear_unit: bool = False - activation_func: Callable = F.gelu - - # initialization - init_method: Callable = None - output_layer_init_method: Callable = None - init_method_std: float = 0.02 - - # mixed-precision - apply_query_key_layer_scaling: bool = True - attention_softmax_in_fp32: bool = True - - # communication - - # fusion - bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ? - masked_softmax_fusion: bool = False - persist_layer_norm: bool = False - bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? - - # activation recomputation - recompute_granularity: str = None - recompute_method: str = None - recompute_num_layers: int = None - distribute_saved_activations: bool = None - - # fp8 related - fp8: str = None - fp8_margin: int = 0 - fp8_interval: int = 1 - fp8_amax_history_len: int = 1 - fp8_amax_compute_algo: str = "most_recent" - fp8_wgrad: bool = True - - # experimental section (TODO: move to apt. section above once stable) - normalization: bool = "LayerNorm" # alt value supported by TE: "RMSNorm" - - def __post_init__(self): - """ Python dataclass method that is used to modify attributes after initialization. - See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. - """ - super().__post_init__() - if self.fp16 and self.bf16: - raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.') - - if self.num_attention_heads % self.tensor_model_parallel_size != 0: - raise ValueError( - f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " - f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." - ) - - if self.ffn_hidden_size is None: - self.ffn_hidden_size = 4 * self.hidden_size - - if self.kv_channels is None: - self.kv_channels = self.hidden_size // self.num_attention_heads - - if self.num_query_groups is None: - self.num_query_groups = self.num_attention_heads - - if self.num_query_groups % self.tensor_model_parallel_size != 0: - raise ValueError( - f"num_query_groups ({self.num_query_groups}) must be a multiple of " - f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." - ) - - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - - if self.recompute_granularity is not None: - if not self.recompute_granularity in ['full', 'selective']: - raise ValueError( - f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' 
- ) - - if self.recompute_method is not None: - if not self.recompute_method in ['block', 'uniform']: - raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".') - elif self.recompute_granularity != 'selective': - raise ValueError( - f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' - ) - - if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: - raise ValueError( - f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' - f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' - ) - elif self.recompute_granularity == 'selective' and self.recompute_num_layers is not None: - raise ValueError( - f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' - ) - - if self.distribute_saved_activations and self.sequence_parallel: - raise ValueError( - f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' - ) - - if self.virtual_pipeline_model_parallel_size is not None: - if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: - raise ValueError( - f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' - ) - - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - - if self.bias_gelu_fusion: - if not self.add_bias_linear: - raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.") - - if self.activation_func != F.gelu: - raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') - - if self.init_method is None: - self.init_method = init_method_normal(self.init_method_std) - - if self.output_layer_init_method is None: - self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py deleted file mode 100644 index e592a1e0b1ac..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon_mcore/falcon_transformer_layer.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import re - -import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityOp -from megatron.core.transformer.mlp import MLP -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_viewless_tensor - -# from megatron.core.transformer.attention import SelfAttention -# change attention due to extra layernorm before mlp, ln_mlp. 
-from nemo.collections.nlp.models.language_modeling.megatron.falcon_mcore.falcon_attention import SelfAttention - -""" We use the following notation throughout this file: - h: hidden size - n: number of attention heads - p: number of model parallel partitions - np: n/p - hp: h/p - hn: h/n - b: batch size - s: sequence length - l: number of layers - Transformer takes input of size [s, b, h] and returns a - tensor of the same size. We use the following arguments: - hyperparameters: transformer hyperparameters -""" - - -class FalconTransformerLayer(MegatronModule): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - - Args: - new_decoder_architecture (bool): - Whether to use Falcon's new decoder architecture that were used in 7B/40B/180B variants. - - parallel_attention (bool): - Whether to use parallel attention, which computes attention in parallel with feed forward layer. - - """ - - def __init__( - self, - config: TransformerConfig, - layer_number: int = 1, - self_attn_mask_type=AttnMaskType.padding, - parallel_attention=False, - new_decoder_architecture=False, - ): - super().__init__(config=config) - self.config: TransformerConfig = config - - self.layer_number = layer_number + self._get_layer_offset() - - self.self_attn_mask_type = self_attn_mask_type - - self.new_decoder_architecture = new_decoder_architecture - self.parallel_attention = parallel_attention - - # Layernorm on the input data. - # TODO: add pytorch only layernorm - self.input_layernorm = self._create_identity_op() - - self.mlp_layernorm = self._create_identity_op() if self.new_decoder_architecture else None - - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - # Layernorm on the attention output - self.post_self_attn_layernorm = self._create_identity_op() - - # Self attention. - self.self_attention = SelfAttention( - config=self.config, layer_number=layer_number, attn_mask_type=self_attn_mask_type, - ) - - # MLP - self.mlp = MLP(config=self.config) - - # @jcasper how should we handle nvfuser? - # Set bias+dropout+add fusion grad_enable execution handler. 
- # TORCH_MAJOR = int(torch.__version__.split('.')[0]) - # TORCH_MINOR = int(torch.__version__.split('.')[1]) - # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad - self.bias_dropout_add_exec_handler = torch.enable_grad - - def _create_identity_op(self): - """Helper function to create an IdentityOp with common parameters.""" - return IdentityOp( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer_offset(self): - - pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() - - num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - total_num_layers = self.config.num_layers - num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size - total_virtual_chunks = total_num_layers // vp_size - offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) - - else: - # Each stage gets a contiguous set of layers. - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - offset = pipeline_rank * num_layers_per_pipeline_rank - else: - offset = 0 - - return offset - - def forward( - self, - hidden_states, - attention_mask, - encoder_output=None, - enc_dec_attn_mask=None, - inference_params=None, - rotary_pos_emb=None, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - input_mlp_ln = layernorm_output - - # Self attention. - attention_output_with_bias = self.self_attention( - layernorm_output, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, - ) - - # Residual connection. - if self.config.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - # falcon specific - if self.new_decoder_architecture: - mlp_ln_output = self.mlp_layernorm(hidden_states) - - bias_dropout_add_func = get_bias_dropout_add(self.training, self.config.bias_dropout_fusion) - - # bias_dropout_add fusion returning fp32 instead of bf16 - with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func(attention_output_with_bias, residual, self.config.hidden_dropout) - - # falcon specific - if not self.new_decoder_architecture: - if self.parallel_attention: - layernorm_output = input_mlp_ln - else: - layernorm_output = self.post_self_attn_layernorm(layernorm_input) - residual = ( - layernorm_input if not self.config.apply_residual_connection_post_layernorm else layernorm_output - ) - else: - layernorm_output = mlp_ln_output - - # MLP. 
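For orientation, the Falcon-specific branching above reduces to three residual wirings. A minimal sketch in plain PyTorch (dropout, bias fusion and model parallelism omitted; the class and attribute names here are illustrative, not the Megatron ones):

```python
import torch.nn as nn


class FalconBlockSketch(nn.Module):
    """Illustrative only: how the three Falcon layer variants combine residuals."""

    def __init__(self, hidden_size, attn, mlp, new_decoder_architecture=False, parallel_attention=False):
        super().__init__()
        self.attn, self.mlp = attn, mlp
        self.new_decoder_architecture = new_decoder_architecture
        self.parallel_attention = parallel_attention
        self.input_ln = nn.LayerNorm(hidden_size)  # ln_attn / input_layernorm
        self.mlp_ln = nn.LayerNorm(hidden_size) if new_decoder_architecture else None  # ln_mlp
        self.post_attn_ln = (
            None if (new_decoder_architecture or parallel_attention) else nn.LayerNorm(hidden_size)
        )

    def forward(self, x):
        attn_out = self.attn(self.input_ln(x))
        if self.new_decoder_architecture:
            # 40B/180B style: two layernorms, attention and MLP both read the layer input.
            return x + attn_out + self.mlp(self.mlp_ln(x))
        if self.parallel_attention:
            # 7B style: one layernorm shared by attention and MLP, a single residual add.
            return x + attn_out + self.mlp(self.input_ln(x))
        # falcon-rw style: sequential, GPT-2-like residual structure.
        h = x + attn_out
        return h + self.mlp(self.post_attn_ln(h))
```

The conversion script later in this series relies on the same distinction when deciding whether a HuggingFace layernorm maps to `input_layernorm`, `pre_mlp_layernorm`, or `post_self_attn_layernorm`.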
- mlp_output_with_bias = self.mlp(layernorm_output) - - # falcon specific: - if self.new_decoder_architecture or self.parallel_attention: - mlp_output_with_bias = mlp_output_with_bias + attention_output_with_bias - - with self.bias_dropout_add_exec_handler(): - output = bias_dropout_add_func(mlp_output_with_bias, residual, self.config.hidden_dropout) - - # Jit compiled function creates 'view' tensor. This tensor - # potentially gets saved in the MPU checkpoint function context, - # which rejects view tensors. While making a viewless tensor here - # won't result in memory savings (like the data loader, or - # p2p_communication), it serves to document the origin of this - # 'view' tensor. - output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) - - return output - - def sharded_state_dict(self, prefix=''): - - # state_dict = self.state_dict(prefix=prefix, keep_vars=True) - state_dict = self.state_dict(keep_vars=True) - - tensor_parallel_layers_axis_map = { - 'self_attention.linear_qkv.weight': 0, - 'self_attention.linear_qkv.bias': 0, - 'self_attention.linear_proj.weight': 1, - 'mlp.linear_fc1.weight': 0, - 'mlp.linear_fc1.bias': 0, - 'mlp.linear_fc2.weight': 1, - } - - offset = self._get_layer_offset() - num_layers = self.config.num_layers - - sharded_state_dict = {} - - for layer_name in state_dict.keys(): - tensor = state_dict[layer_name] - global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 - layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock - sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding - - if layer_name in tensor_parallel_layers_axis_map: - tp_axis = tensor_parallel_layers_axis_map[layer_name] - # TP sharding - sharded_offsets.append( - [ - tp_axis + 1, # +1 for PP dimension - parallel_state.get_tensor_model_parallel_rank(), - parallel_state.get_tensor_model_parallel_world_size(), - ] - ) - replica_id = parallel_state.get_data_parallel_rank() - else: - replica_id = ( - parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() - + parallel_state.get_tensor_model_parallel_rank() - ) - - if layer_name.endswith('._extra_state'): - sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, - ) - - else: - sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( - f'{prefix}{layer_name}', - tensor, - *sharded_offsets, - replica_id=replica_id, - prepend_axis_num=1, # for PP sharding - ) - - return sharded_state_dict From f2ad089c8c7852bd06035a76393919eaba964256 Mon Sep 17 00:00:00 2001 From: vivian Date: Thu, 28 Sep 2023 07:11:20 +0000 Subject: [PATCH 14/69] refactor conversion script to align with others --- .../convert_hf_falcon_to_nemo.py | 482 +++++++----------- 1 file changed, 192 insertions(+), 290 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index f7995d471e24..5fb198a3fd91 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -23,109 +23,65 @@ Example to run this conversion script: ``` python convert_hf_falcon_to_nemo.py \ - --in-file \ - --out-file \ - --tokenizer-type \ + --config /path/to/megatron_gpt_config.yaml \ + --input \ + --output \ --precision ``` """ -import logging +import argparse import os +from typing import Dict import time -from 
argparse import ArgumentParser -from collections import OrderedDict +import pytorch_lightning as pl import torch +import yaml from omegaconf import OmegaConf -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.trainer.trainer import Trainer -from transformers import AutoModelForCausalLM, AutoTokenizer, FalconConfig +from transformers import FalconConfig, AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) - -# TODO: -# [Y] refactor ckpt func to make it cleaner -# [Y] dict tokenizer mapping for falcon family -# [ ] good way to add new_decoder_architecture and parallel_attn in megatron_gpt_config.yaml -# [ ] safetensors loading. (only 180b used safetensors) -# [Y] test on non parallel attention model (block by no alibi support? 1b-rw good, 7b-rw still some time) -# [Y] hf config name mapping for falcon 7b and 40b. -# [Y] trust remote code add -# [Y] MQA MHA GQA num_kv_heads and mcore's GQA logic add (not sure about MQA) -# [Y] When bias_gelu_fusion is True, add_bias_linear must also be True. error -# [Y] update save_to and restore_from for dist checkpointing -# [ ] remove unnecessary comments and codes. - - -def setup_logging(log_file="test.log"): - logging.basicConfig( - filename=log_file, - level=logging.DEBUG, - format='%(asctime)s [%(levelname)s] - %(message)s', - datefmt='%d-%b-%y %H:%M:%S', - ) - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--in-file", type=str, default=None, required=True, help="Path to Huggingface Falcon checkpoints", - ) - parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") - parser.add_argument("--precision", type=str, default="32", help="Model precision") - parser.add_argument( - "--tokenizer-type", - type=str, - default="tiiuae/falcon-7b", - help="Tokenizer type to use, e.g., 'tiiuae/falcon-7b'.", - ) - args = parser.parse_args() - return args - - -def load_model(cls, checkpoint, strict, **kwargs): - try: - if 'cfg' in kwargs: - model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False): + def get_new_key(old_key): + if old_key == "transformer.word_embeddings.weight": + return "embedding.word_embeddings.weight" + elif old_key.startswith("transformer.ln_f"): + return old_key.replace("transformer.ln_f", "decoder.final_layernorm") + elif old_key.startswith("lm_head"): + return old_key.replace("lm_head", "output_layer") + + # For the rest, a base transformation + key = old_key.replace("transformer.h", "decoder.layers") + + # Handling the layer normalization replacements + if falcon_config.new_decoder_architecture: + key = key.replace("ln_attn", "input_layernorm") + key = key.replace("ln_mlp", "pre_mlp_layernorm") else: - model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) - for name, module in model.named_parameters(): - if name in checkpoint['state_dict']: - module.data = checkpoint['state_dict'][name] - checkpoint['state_dict'].pop(name) - else: - print(f"Unexpected key: {name} not in checkpoint but in model.") - - for name, buffer in model.named_buffers(): - if name in checkpoint['state_dict']: 
- buffer.data = checkpoint['state_dict'][name] - checkpoint['state_dict'].pop(name) + key = key.replace("input_layernorm", "input_layernorm") + if not falcon_config.parallel_attn: + key = key.replace("post_attention_layernorm", "post_self_attn_layernorm") + + key = key.replace("self_attention.dense", "self_attention.linear_proj") + key = key.replace("self_attention.query_key_value", "self_attention.linear_qkv") + key = key.replace("dense_h_to_4h", "linear_fc1") + key = key.replace("dense_4h_to_h", "linear_fc2") + return key - if len(checkpoint['state_dict'].keys()) != 0: - raise RuntimeError( - f"Additional keys: {checkpoint['state_dict'].keys()} in checkpoint but not in model." - ) + new_dict = {} + # amp O2 mode has different state dict name + prefix = "model.module." if amp else "model." - # register the artifacts - cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] - if cfg.tokenizer.model is not None: - model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) - if cfg.tokenizer.vocab_file is not None: - model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) - if cfg.tokenizer.merge_file is not None: - model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) - finally: - cls._set_model_restore_state(is_being_restored=False) - return model + for old_key, val in state_dict.items(): + new_key = get_new_key(old_key) + new_key = prefix + new_key + new_dict[new_key] = val + return new_dict def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. @@ -134,8 +90,7 @@ def load_falcon_config(args) -> FalconConfig: `transformers.FalconModel`. need to manually set the config values and force to `falcon` model type. """ - config = FalconConfig.from_pretrained(args.in_file) - + config = FalconConfig.from_pretrained(args.input) if config.model_type == 'RefinedWeb': mappings = { "num_hidden_layers": config.n_layer, @@ -159,229 +114,176 @@ def load_falcon_config(args) -> FalconConfig: config.model_type = 'falcon' return config +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file" + ) + parser.add_argument( + "--input", type=str, required=True, help="Falcon variants from HuggingFace hub or local dir with downloaded model" + ) + parser.add_argument( + "--output", type=str, default=".", help="Path to dir where to store output .nemo file" + ) + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + parser.add_argument( + "--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving" + ) + + args = parser.parse_args() -def load_nemo_config(args): + if not os.path.isdir(args.output): + raise FileNotFoundError(f"Output directory '{args.output}' does not exist") + falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") - nemo_config = OmegaConf.load( - os.path.join(os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml') - ).model - nemo_config.encoder_seq_length = falcon_config.max_position_embeddings - nemo_config.num_layers = int(falcon_config.num_hidden_layers) - nemo_config.hidden_size = falcon_config.hidden_size - nemo_config.num_attention_heads = falcon_config.num_attention_heads - nemo_config.max_position_embeddings = falcon_config.max_position_embeddings - nemo_config.init_method_std = falcon_config.initializer_range - 
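To make the mapping concrete, a few representative renames that the new `convert_state_dict` above should produce for a new-decoder-architecture checkpoint (illustrative pairs only; the `model.` prefix becomes `model.module.` when `megatron_amp_O2` is enabled):

```python
# HuggingFace Falcon key -> NeMo / mcore key
EXPECTED_RENAMES = {
    "transformer.word_embeddings.weight": "model.embedding.word_embeddings.weight",
    "transformer.h.0.ln_attn.weight": "model.decoder.layers.0.input_layernorm.weight",
    "transformer.h.0.ln_mlp.weight": "model.decoder.layers.0.pre_mlp_layernorm.weight",
    "transformer.h.0.self_attention.query_key_value.weight": "model.decoder.layers.0.self_attention.linear_qkv.weight",
    "transformer.h.0.self_attention.dense.weight": "model.decoder.layers.0.self_attention.linear_proj.weight",
    "transformer.h.0.mlp.dense_h_to_4h.weight": "model.decoder.layers.0.mlp.linear_fc1.weight",
    "transformer.h.0.mlp.dense_4h_to_h.weight": "model.decoder.layers.0.mlp.linear_fc2.weight",
    "transformer.ln_f.weight": "model.decoder.final_layernorm.weight",
    "lm_head.weight": "model.output_layer.weight",
}
```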
nemo_config.layernorm_epsilon = falcon_config.layer_norm_epsilon - try: - if falcon_config.alibi: + with open(args.config, "r", encoding="utf_8") as f: + orig_cfg = yaml.safe_load(f) + + model_dict = orig_cfg["model"] + + if "data" in model_dict: + del model_dict["data"] + + override_model_dict = { + "micro_batch_size": 1, + "global_batch_size": 1, + "tensor_model_parallel_size": 1, + "pipeline_model_parallel_size": 1, + "megatron_amp_O2": False, + "transformer_engine": True, + "use_cpu_initialization": not args.cuda, + "normalization": "layernorm", + "mcore_gpt": True, + "num_query_groups": None, # MHA + "hidden_size": falcon_config.hidden_size, + "encoder_seq_length": falcon_config.max_position_embeddings, + "max_position_embeddings": falcon_config.max_position_embeddings, + "num_layers": falcon_config.num_hidden_layers, + "num_attention_heads": falcon_config.num_attention_heads, + "ffn_hidden_size": falcon_config.hidden_size * 4, + "layernorm_epsilon": falcon_config.layer_norm_epsilon, + "pre_process": True, + "post_process": True, + "apply_query_key_layer_scaling": False, + "bias": falcon_config.bias, + "transformer_block_type": "pre_ln", + "fp32_residual_connection": False, + "hidden_dropout": falcon_config.hidden_dropout, + "attention_dropout": falcon_config.attention_dropout, + "ffn_dropout": 0, + "share_embeddings_and_output_weights": False, + "position_embedding_type": "rope", + "precision": args.precision, + "init_method_std": falcon_config.initializer_range, + "new_decoder_architecture": falcon_config.new_decoder_architecture, + "parallel_attention": falcon_config.parallel_attn, + "activation": "gelu", + "bias_activation_fusion": False, + "bias_dropout_add_fusion": False, + "seq_len_interpolation_factor": None, + } + tokenizer_dict = { + "library": "huggingface", + "type": args.input, + "use_fast": True, + } + trainer_dict = { + "devices": 1, + "num_nodes": 1, + "accelerator": "gpu" if args.cuda else "cpu", + "precision": args.precision, + "logger": False, + "enable_checkpointing": False, + "max_epochs": -1, + "max_steps": 100000, + "log_every_n_steps": 10, + "val_check_interval": 100, + "limit_val_batches": 50, + "limit_test_batches": 500, + "accumulate_grad_batches": 1, + "gradient_clip_val": 1.0, + "benchmark": False, + "enable_model_summary": False, + "strategy": NLPDDPStrategy(), + } + + # Additional logic for position_embedding_type = alibi + if falcon_config.alibi: + try: raise ValueError( "Alibi is not yet supported in Megatron Core, \ force to use RoPE will generate suboptimal responses" ) - except ValueError as e: - print(e) - finally: - nemo_config.position_embedding_type = 'rope' - nemo_config.bias = falcon_config.bias - nemo_config.hidden_dropout = falcon_config.hidden_dropout - nemo_config.attention_dropout = falcon_config.attention_dropout - # TODO: how does vocab_file, merge_file etc get mapped automatically in respect to variants of falcon models? - tokenizer_dict = { - 'library': 'huggingface', - 'type': args.tokenizer_type, # FIXME: can it work from local args.input too, fix for falcon family? 
- } - - nemo_config.tokenizer = tokenizer_dict - - nemo_config.new_decoder_architecture = falcon_config.new_decoder_architecture #bool, if True, always use parallel attn - nemo_config.parallel_attention = falcon_config.parallel_attn + except ValueError as e: + print(e) - nemo_config.num_query_groups = ( - falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None - ) - nemo_config.use_cpu_initialization = True - nemo_config.activation = 'gelu' + # Additional logic for num_query_groups + if override_model_dict.get("num_query_groups") is None: + override_model_dict["num_query_groups"] = ( + falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None + ) + + # Additional logic for bias fusion + if falcon_config.bias: + override_model_dict["bias_activation_fusion"] = True + override_model_dict["bias_dropout_add_fusion"] = True + + # Addtional logic for rope scaling if falcon_config.rope_scaling is not None: if falcon_config.rope_scaling.type == 'linear': - nemo_config['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor + override_model_dict['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - - nemo_config.mcore_gpt = True - nemo_config.transformer_engine = True - nemo_config.bias_activation_fusion = False - nemo_config.bias_dropout_add_fusion = False - nemo_config.share_embeddings_and_output_weights = False - - base = 128 - while falcon_config.vocab_size % base != 0: - base //= 2 - nemo_config.make_vocab_size_divisible_by = base - - return nemo_config - - -def determine_precision(args): - """Helper function to determine the precision of model - """ - if args.precision in ["32", "16"]: - return int(args.precision) - elif args.precision in ["bf16", "bf16-mixed"]: - if not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()): - logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") - return args.precision[2:] # prune 'bf' from string - return args.precision - - -def determine_dtype(precision): - dtype_map = { - "32": torch.float32, - "16": torch.float16, - "16-mixed": torch.float16, - "bf16": torch.bfloat16, - "bf16-mixed": torch.bfloat16, - } - return dtype_map.get(precision, torch.float32) # default to torch.float32 - - -def convert(args): - logging.info(f"loading checkpoint {args.in_file}") - tik = time.time() - model = AutoModelForCausalLM.from_pretrained(args.in_file, trust_remote_code=True) - falcon_config = load_falcon_config(args) - # debug - logging.debug(f"initial falcon_config, {falcon_config}") - - nemo_config = load_nemo_config(args) - # debug - logging.debug(f"initial nemo_config, {nemo_config}") - precision = determine_precision(args) - - plugins = [] - - if precision in ['16', '16-mixed', 'bf16', 'bf16-mixed']: - scaler_params = { - 'init_scale': nemo_config.get('native_amp_init_scale', 2 ** 32), - 'growth_interval': nemo_config.get('native_amp_growth_interval', 1000), - 'hysteresis': nemo_config.get('hysteresis', 2), - } - - plugin_precision = '16-mixed' if precision in ['16', '16-mixed'] else 'bf16-mixed' - scaler = GradScaler(**scaler_params) if precision in ['16', '16-mixed'] else None - - dtype = determine_dtype(precision) - nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + - hidden_size = falcon_config.hidden_size - head_num = falcon_config.num_attention_heads - head_size = hidden_size // head_num - num_layers = falcon_config.num_hidden_layers - - # - MHA: num_heads = num_kv_heads - # - Multi-Query Attention: num_kv_heads = 1 - # - Grouped-Query Attention: num_heads % num_kv_heads = 0 - num_query_groups = ( - nemo_config.num_query_groups - if nemo_config.num_query_groups and nemo_config.num_query_groups != head_num - else head_num - ) - assert ( - head_num % num_query_groups == 0 - ), f'head_num ({head_num}) must be divisible by num_query_groups ({num_query_groups})' + model_dict.update(override_model_dict) + model_dict["tokenizer"] = tokenizer_dict + model_dict["name"] = 'megatron_falcon_gpt' - param_to_weights = lambda param: param.float() - - checkpoint = OrderedDict() - checkpoint['state_dict'] = OrderedDict() - - def add_to_checkpoint(source_prefix, target_prefix, weight_or_bias): - source_name = f"{source_prefix}.{weight_or_bias}" - if source_name in model.state_dict(): - target_name = f"{target_prefix}.{weight_or_bias}" - checkpoint['state_dict'][target_name] = param_to_weights(model.state_dict()[source_name]) - - def add_weight_and_possible_bias(source_prefix, target_prefix): - add_to_checkpoint(source_prefix, target_prefix, 'weight') - if f"{source_prefix}.bias" in model.state_dict(): - add_to_checkpoint(source_prefix, target_prefix, 'bias') - - add_to_checkpoint('transformer.word_embeddings', 'model.embedding.word_embeddings', 'weight') + omega_cfg = OmegaConf.create(model_dict) + + # output_path = "./falcon_megatron_config.yaml" + # OmegaConf.save(config=omega_cfg, f=output_path) - for l in range(int(num_layers)): - print(f"converting layer {l}") - prefix = f'transformer.h.{l}' + trainer = pl.Trainer(**trainer_dict) - add_weight_and_possible_bias( - f'{prefix}.self_attention.query_key_value', f'model.decoder.layers.{l}.self_attention.linear_qkv' - ) - add_weight_and_possible_bias( - f'{prefix}.self_attention.dense', f'model.decoder.layers.{l}.self_attention.linear_proj' - ) - 
add_weight_and_possible_bias(f'{prefix}.mlp.dense_h_to_4h', f'model.decoder.layers.{l}.mlp.linear_fc1') - add_weight_and_possible_bias(f'{prefix}.mlp.dense_4h_to_h', f'model.decoder.layers.{l}.mlp.linear_fc2') - - if falcon_config.new_decoder_architecture: - add_weight_and_possible_bias( - f'{prefix}.ln_attn', - f'model.decoder.layers.{l}.input_layernorm', - ) - add_weight_and_possible_bias( - f'{prefix}.ln_mlp', - f'model.decoder.layers.{l}.pre_mlp_layernorm', - ) - else: - add_weight_and_possible_bias( - f'{prefix}.input_layernorm', - f'model.decoder.layers.{l}.input_layernorm', - ) - if not falcon_config.parallel_attn: - add_weight_and_possible_bias( - f'{prefix}.post_attention_layernorm', - f'model.decoder.layers.{l}.post_self_attn_layernorm', - ) - - print(f"done layer {l}") + logging.info("Creating Megatron model...") + tik = time.time() + model = MegatronGPTModel(omega_cfg, trainer) + logging.info(f"Created model:\n{model}") - # final layer norm - add_weight_and_possible_bias('transformer.ln_f', 'model.decoder.final_layernorm') + logging.info("Loading HuggingFace model...") + model_hf = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True) + logging.info(f"Loaded model:\n{model_hf}") - # LM weight - add_to_checkpoint('lm_head', 'model.output_layer', 'weight') + state_dict_hf = model_hf.state_dict() + convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2) - checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - #logging.debug(f'final checkpoint, {checkpoint}') + logging.info("Loading state dict...") + missing_keys, unexpected_keys = model.load_state_dict(convert_dict, strict=False) - del model - - # state dict name for megatron_amp_O2 is different - if nemo_config.get('megatron_amp_O2', False): - keys = list(checkpoint['state_dict'].keys()) - for key in keys: - checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) + if missing_keys: + # Keys ending with '_extra_state' are related to Transformer Engine internals + missing_keys_non_extra = [key for key in missing_keys if not key.endswith("_extra_state")] + if missing_keys_non_extra: + logging.critical("Missing keys were detected during the load, something has gone wrong. Aborting.") + raise RuntimeError(f"Missing keys: \n{missing_keys_non_extra}") - #model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - model = MegatronGPTModel(checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY], trainer=trainer) + if unexpected_keys: + logging.critical("Unexpected keys were detected which should not happen. 
Aborting.") + raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") - model._save_restore_connector = NLPSaveRestoreConnector() + logging.info("Saving model...") - # cast to target precision and disable cpu init + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) - model.cfg.use_cpu_initialization = False - # We make sure that the tokenizer can be instantiated later regardless of args.input - model.cfg.tokenizer.update(type=args.tokenizer_type) - # save model - - model.save_to(args.out_file) - logging.info(f'NeMo model saved to: {args.out_file}') - + model.cfg.update(use_cpu_initialization=False) + name_last_part = os.path.basename(args.input.rstrip('/')) + model.save_to(os.path.join(args.output, f'falcon_{name_last_part}_{args.precision}_tp1_pp1.nemo')) + logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'Weights loaded and saved. Total time: {t}') - - -if __name__ == '__main__': - setup_logging() - args = get_args() - convert(args) + logging.info(f'nemo model created and saved. Total time: {t}') \ No newline at end of file From 47d2f23fd50f4627f9804da273d28c0c2a349dc3 Mon Sep 17 00:00:00 2001 From: vivian Date: Thu, 28 Sep 2023 07:16:46 +0000 Subject: [PATCH 15/69] add support for falcon-rw model (normal gpt architecture) --- .../spec_falcon/spec_falcon_decoder_layer.py | 12 +++++++++--- .../models/language_modeling/megatron_gpt_model.py | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py index 68f417b04409..a7bb1a588e40 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py @@ -67,9 +67,15 @@ def __init__( self.self_attn_mask_type = self_attn_mask_type - self.new_decoder_architecture = self.config.new_decoder_architecture - - self.parallel_attention = self.config.parallel_attention + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index cf640a813dae..464457a13632 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -235,6 +235,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.mcore_gpt = cfg.get('mcore_gpt', False) # Falcon specific args + self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) self.parallel_attention = cfg.get('parallel_attention', False) @@ -325,7 +326,7 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt and (self.new_decoder_architecture or self.parallel_attention): + if self.mcore_gpt and self.falcon_name: 
FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() transformer_layer_spec = falcon_layer_spec #debug @@ -344,7 +345,6 @@ def model_provider_func(self, pre_process, post_process): seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), ) - logging.info(f'model architecture is {model}') elif self.mcore_gpt: transformer_layer_spec = gpt_layer_with_transformer_engine_spec From ed8869a81652efad710bb538481c2510a5f08087 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:15:00 +0000 Subject: [PATCH 16/69] modify falcon 7b config and remove trust remote code due to HF code changes --- .../nlp_language_modeling/convert_hf_falcon_to_nemo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 5fb198a3fd91..a47ec258662e 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -222,9 +222,10 @@ def load_falcon_config(args) -> FalconConfig: # Additional logic for num_query_groups if override_model_dict.get("num_query_groups") is None: - override_model_dict["num_query_groups"] = ( - falcon_config.num_kv_heads if falcon_config.new_decoder_architecture or falcon_config.multi_query else None - ) + if falcon_config.new_decoder_architecture: + override_model_dict["num_query_groups"] = falcon_config.num_kv_heads + elif falcon_config.multi_query: + override_model_dict["num_query_groups"] = 1 # Additional logic for bias fusion if falcon_config.bias: @@ -256,7 +257,7 @@ def load_falcon_config(args) -> FalconConfig: logging.info(f"Created model:\n{model}") logging.info("Loading HuggingFace model...") - model_hf = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True) + model_hf = AutoModelForCausalLM.from_pretrained(args.input) logging.info(f"Loaded model:\n{model_hf}") state_dict_hf = model_hf.state_dict() From 59e0f2ed79040a145e3f9fb16ff5540413c36f9a Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:32:53 +0000 Subject: [PATCH 17/69] rename falcon implementation dir --- .../megatron/{spec_falcon => falcon}/__init__.py | 0 .../megatron/{spec_falcon => falcon}/falcon_gpt_model.py | 0 .../megatron/{spec_falcon => falcon}/falcon_spec.py | 0 .../megatron/{spec_falcon => falcon}/spec_falcon_decoder_block.py | 0 .../megatron/{spec_falcon => falcon}/spec_falcon_decoder_layer.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/__init__.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/falcon_gpt_model.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/falcon_spec.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/spec_falcon_decoder_block.py (100%) rename nemo/collections/nlp/models/language_modeling/megatron/{spec_falcon => falcon}/spec_falcon_decoder_layer.py (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/__init__.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_gpt_model.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/falcon_spec.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_block.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/spec_falcon/spec_falcon_decoder_layer.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py From 03d06bcd32b187908b1ab82fa93054fc1fd30bf1 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:44:21 +0000 Subject: [PATCH 18/69] change dir name --- .../megatron/falcon/falcon_gpt_model.py | 10 +++++----- .../language_modeling/megatron/falcon/falcon_spec.py | 2 +- .../megatron/falcon/spec_falcon_decoder_block.py | 8 +++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 67b93283b0a4..0d8f2ede9ffc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -15,7 +15,7 @@ from torch import Tensor # from megatron.core.transformer.transformer_block import TransformerBlock -from .spec_falcon_decoder_block import FalconTransformerBlock +from .falcon_decoder_block import FalconTransformerBlock class FalconGPTModel(MegatronModule): @@ -189,7 +189,7 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) - + if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() @@ -197,7 +197,7 @@ def forward( # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - + # [s b] => [b, s] loss = loss.transpose(0, 1).contiguous() return loss @@ -247,7 +247,7 @@ def initialize_last_stage_with_word_embeddings(self): weight.data, group=parallel_state.get_embedding_group() ) - elif not getattr(GPTModel, "embedding_warning_printed", False): + elif not getattr(FalconGPTModel, "embedding_warning_printed", 
False): logging.getLogger(__name__).warning( "Distributed processes aren't initialized, so the output layer " "is not initialized with weights from the word embeddings. " @@ -255,7 +255,7 @@ def initialize_last_stage_with_word_embeddings(self): "this needs to be handled manually. If you are training " "something is definitely wrong." ) - GPTModel.embedding_warning_printed = True + FalconGPTModel.embedding_warning_printed = True def sharded_state_dict(self, prefix=''): sharded_state_dict = {} diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 70c90e520937..084a82c71c48 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -13,7 +13,7 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec -from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE falcon_layer_spec = ModuleSpec( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py index 8a009c39b59e..2717ff399a63 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py @@ -12,7 +12,7 @@ from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig -from .spec_falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor @@ -208,7 +208,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p hidden_states = make_viewless_tensor( inp=hidden_states, requires_grad=True, keep_graph=True, ) - + if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() else: @@ -250,13 +250,15 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p rotary_pos_emb=rotary_pos_emb, ) else: - for layer in self.layers: + for idx, layer in enumerate(self.layers): hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, inference_params=inference_params, ) + logging.debug(f"Layer {idx + 1} tensor:", hidden_states) + logging.debug(f"Layer {idx + 1} tensor shape:", hidden_states.shape) # Final layer norm. 
if self.post_process and self.post_layer_norm: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 464457a13632..6b6e899a11a2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -106,8 +106,8 @@ def import_falcon_gpt_model(): """ try: #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_spec import falcon_layer_spec + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec return FalconGPTModel, falcon_layer_spec except (ImportError, ModuleNotFoundError): raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") From 71b25b81d54b6ac04a8d64183817ffee5ebe1de2 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:50:19 +0000 Subject: [PATCH 19/69] modify block name --- .../{spec_falcon_decoder_block.py => falcon_decoder_block.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/falcon/{spec_falcon_decoder_block.py => falcon_decoder_block.py} (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_block.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py From 9bb2e32f0a4cc1146a9aa64458213f6e4c9b9895 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:52:08 +0000 Subject: [PATCH 20/69] rename decoder layer --- .../{spec_falcon_decoder_layer.py => falcon_decoder_layer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nemo/collections/nlp/models/language_modeling/megatron/falcon/{spec_falcon_decoder_layer.py => falcon_decoder_layer.py} (100%) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron/falcon/spec_falcon_decoder_layer.py rename to nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py From d1056032091ce701d08c2defd559b1a78af9a18d Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 03:53:24 +0000 Subject: [PATCH 21/69] clean up --- .../nlp/models/language_modeling/megatron/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index b024f0b061c3..bdd9da8799e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,7 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from 
nemo.collections.nlp.models.language_modeling.megatron.spec_falcon.falcon_gpt_model import FalconGPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True From 65fb7266c03afba2c5c0194263879d3d41844318 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 3 Oct 2023 04:03:59 +0000 Subject: [PATCH 22/69] remove debug --- .../language_modeling/megatron/falcon/falcon_decoder_block.py | 2 -- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 2717ff399a63..52eb22a25f85 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -257,8 +257,6 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p rotary_pos_emb=rotary_pos_emb, inference_params=inference_params, ) - logging.debug(f"Layer {idx + 1} tensor:", hidden_states) - logging.debug(f"Layer {idx + 1} tensor shape:", hidden_states.shape) # Final layer norm. if self.post_process and self.post_layer_norm: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 6b6e899a11a2..aa68ca44df8a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -329,8 +329,6 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt and self.falcon_name: FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() transformer_layer_spec = falcon_layer_spec - #debug - logging.info(f'falcon gpt config, {self.transformer_config}') model = FalconGPTModel( config=self.transformer_config, transformer_layer_spec = transformer_layer_spec, From c4ad769d8994fa56d48f0320aedadc0ebb0de936 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:13:37 +0000 Subject: [PATCH 23/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron/falcon/__init__.py | 2 +- .../megatron/falcon/falcon_decoder_block.py | 18 ++--- .../megatron/falcon/falcon_decoder_layer.py | 74 +++++++++---------- .../megatron/falcon/falcon_gpt_model.py | 25 ++----- .../megatron/falcon/falcon_spec.py | 10 +-- .../language_modeling/megatron_gpt_model.py | 33 +++++---- .../convert_hf_falcon_to_nemo.py | 51 +++++++------ 7 files changed, 96 insertions(+), 117 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index 46da18a40345..5dd085d829f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -1 +1 @@ -from .falcon_gpt_model import FalconGPTModel \ No newline at end of file +from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py 
b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 52eb22a25f85..16bda328f38b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -4,7 +4,6 @@ from contextlib import nullcontext import torch - from megatron.core import parallel_state, tensor_parallel from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.transformer.custom_layers.transformer_engine import TENorm @@ -12,9 +11,10 @@ from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor +from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules + class FalconTransformerBlock(MegatronModule): """Transformer class.""" @@ -205,10 +205,8 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # likely redundant, since p2p_communication.py (likely originator) # already creates viewless tensors. That said, make_viewless_tensor() # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor( - inp=hidden_states, requires_grad=True, keep_graph=True, - ) - + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) + if self.config.sequence_parallel: rng_context = tensor_parallel.get_cuda_rng_tracker().fork() else: @@ -245,9 +243,7 @@ def forward(self, hidden_states, attention_mask, inference_params=None, rotary_p # Forward pass. 
if self.config.recompute_granularity == 'full': hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, + hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, ) else: for idx, layer in enumerate(self.layers): @@ -283,8 +279,6 @@ def sharded_state_dict(self, prefix=''): if 'final_layernorm.bias' in state_dict.keys(): tensor = state_dict['final_layernorm.bias'] layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( - tensor, layer_name - ) + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index a7bb1a588e40..9ff495c87e7e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -31,6 +31,7 @@ hyperparameters: transformer hyperparameters """ + @dataclass class FalconTransformerLayerSubmodules: input_layernorm: Union[ModuleSpec, type] = IdentityOp @@ -38,7 +39,7 @@ class FalconTransformerLayerSubmodules: self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp - + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp mlp: Union[ModuleSpec, type] = IdentityOp mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp @@ -54,14 +55,13 @@ class FalconTransformerLayer(MegatronModule): def __init__( self, - config: TransformerConfig, # should come from FalconTransformerConfig class + config: TransformerConfig, # should come from FalconTransformerConfig class submodules: FalconTransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): super().__init__(config=config) self.config: TransformerConfig = config - self.layer_number = layer_number + self._get_layer_offset() @@ -70,13 +70,12 @@ def __init__( if hasattr(self.config, 'new_decoder_architecture'): self.new_decoder_architecture = self.config.new_decoder_architecture else: - self.new_decoder_architecture = None - + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): self.parallel_attention = self.config.parallel_attention else: - self.parallel_attention = None - + self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm @@ -92,9 +91,7 @@ def __init__( ) ## [Module 2: SelfAttention] - self.self_attention = build_module( - submodules.self_attention, config=self.config, layer_number=layer_number, - ) + self.self_attention = build_module(submodules.self_attention, config=self.config, layer_number=layer_number,) ## [Module 3: BiasDropoutFusion] Optional self.self_attn_bda = build_module(submodules.self_attn_bda) @@ -115,23 +112,27 @@ def __init__( ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - 
normalization=self.config.normalization, - ) if self.new_decoder_architecture else None + self.pre_mlp_layernorm = ( + build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + if self.new_decoder_architecture + else None + ) ## [Module 6: MLP block] self.mlp = build_module(submodules.mlp, config=self.config) ## [Module 7: BiasDropoutFusion] Optional self.mlp_bda = build_module(submodules.mlp_bda) - + # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. # TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -151,7 +152,7 @@ def _get_layer_offset(self): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - + total_num_layers = self.config.num_layers num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size total_virtual_chunks = total_num_layers // vp_size @@ -163,7 +164,7 @@ def _get_layer_offset(self): offset = pipeline_rank * num_layers_per_pipeline_rank else: offset = 0 - + return offset def forward( @@ -176,13 +177,13 @@ def forward( rotary_pos_emb=None, ): # hidden_states: [s, b, h] - + # Residual connection. residual = hidden_states - + if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) - + # Optional Input Layer norm input_layernorm_output = self.input_layernorm(hidden_states) @@ -195,7 +196,7 @@ def forward( inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, ) - + # TODO: could we move `bias_dropout_add_exec_handler` itself # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): @@ -214,11 +215,11 @@ def forward( layernorm_output = mlp_ln_output mlp_output_with_bias = self.mlp(layernorm_output) - + # falcon specific: if self.new_decoder_architecture or self.parallel_attention: - mlp_output= mlp_output_with_bias[0] - attn_output= attention_output_with_bias[0] + mlp_output = mlp_output_with_bias[0] + attn_output = attention_output_with_bias[0] mlp_output_without_bias = mlp_output + attn_output mlp_output_with_bias = (mlp_output_without_bias, None) @@ -235,9 +236,7 @@ def forward( # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = make_viewless_tensor( - inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True - ) + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) return output @@ -279,18 +278,13 @@ def sharded_state_dict(self, prefix=''): replica_id = parallel_state.get_data_parallel_rank() else: replica_id = ( - parallel_state.get_data_parallel_rank() - * parallel_state.get_data_parallel_world_size() + parallel_state.get_data_parallel_rank() * parallel_state.get_data_parallel_world_size() + parallel_state.get_tensor_model_parallel_rank() ) if layer_name.endswith('._extra_state'): sharded_state_dict[layer_key] = ShardedObject( - f'{prefix}{layer_name}', - tensor, - (num_layers,), - (global_layer_offset,), - replica_id, + f'{prefix}{layer_name}', tensor, (num_layers,), (global_layer_offset,), replica_id, ) else: diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 0d8f2ede9ffc..05d449544094 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -62,7 +62,7 @@ def __init__( seq_len_interpolation_factor: Optional[float] = None, ): super(FalconGPTModel, self).__init__(config=config) - + self.config: TransformerConfig = config self.transformer_layer_spec: ModuleSpec = transformer_layer_spec self.vocab_size = vocab_size @@ -116,8 +116,7 @@ def __init__( bias=False, skip_bias_add=False, gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process - and self.share_embeddings_and_output_weights, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): @@ -189,7 +188,7 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) - + if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() @@ -197,7 +196,7 @@ def forward( # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - + # [s b] => [b, s] loss = loss.transpose(0, 1).contiguous() return loss @@ -243,9 +242,7 @@ def initialize_last_stage_with_word_embeddings(self): if torch.distributed.is_initialized(): if parallel_state.is_rank_in_embedding_group(): weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce( - weight.data, group=parallel_state.get_embedding_group() - ) + torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) elif not getattr(FalconGPTModel, "embedding_warning_printed", False): logging.getLogger(__name__).warning( @@ -262,9 +259,7 @@ def sharded_state_dict(self, prefix=''): if self.pre_process: embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict( - prefix=embedding_prefix - ) + embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) sharded_state_dict.update(embedding_sharded_state_dict) decoder_prefix = f'{prefix}decoder.' 
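# [Editor's note] Illustrative sketch, not part of the patch: FalconTransformerLayer.forward()
# above distinguishes three residual-stream layouts. With attn/mlp/ln_* as stand-in callables
# (dropout, biases and the bias-dropout-add fusions omitted), the intended dataflow is roughly:
def falcon_layer_variants_sketch(x, attn, mlp, ln_in, ln_post=None, ln_mlp=None,
                                 parallel_attention=False, new_decoder_architecture=False):
    if new_decoder_architecture:
        # falcon-40b/180b: two layernorms on the same input, both branches summed into the residual
        return x + attn(ln_in(x)) + mlp(ln_mlp(x))
    if parallel_attention:
        # falcon-7b: a single layernorm feeds both branches, branches summed into the residual
        h = ln_in(x)
        return x + attn(h) + mlp(h)
    # falcon-rw: classic GPT ordering with a post-attention layernorm
    h = x + attn(ln_in(x))
    return h + mlp(ln_post(h))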
@@ -282,9 +277,7 @@ def sharded_state_dict(self, prefix=''): first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' dp_rank = parallel_state.get_data_parallel_rank() dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = ( - dp_rank + dp_size - ) # copy of first stage embedding + last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( tensor=tensor, @@ -296,9 +289,7 @@ def sharded_state_dict(self, prefix=''): sharded_state_dict[output_layer_key] = sharded_output_layer_tensor else: - output_layer_state_dict = self.output_layer.state_dict( - prefix=output_layer_prefix, keep_vars=True - ) + output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) output_layer_tensor = output_layer_state_dict[output_layer_key] # independent output layer sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 084a82c71c48..be62fcb33bea 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -3,16 +3,17 @@ from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, - TERowParallelLinear, TENorm, - TEColumnParallelLinear, + TERowParallelLinear, ) from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec + from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE @@ -33,10 +34,7 @@ post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear, - ), + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 69ff32063d72..39dff75be55c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -16,7 +16,7 @@ import os import queue import warnings -from dataclasses import fields, dataclass +from dataclasses import dataclass, fields from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union @@ -76,12 +76,12 @@ try: from megatron.core import InferenceParams, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from 
megatron.core.transformer.spec_utils import import_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal - from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec - from megatron.core.transformer.spec_utils import import_module # TODO @tmoon: Use once available in Megatron-LM # from megatron.core.pipeline_parallel.schedules import DataIteratorList @@ -103,26 +103,30 @@ except (ImportError, ModuleNotFoundError): HAVE_TE = False + def import_falcon_gpt_model(): """Conditionally import FalconGPTModel. """ try: - #from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel + # from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec + return FalconGPTModel, falcon_layer_spec except (ImportError, ModuleNotFoundError): raise ImportError("Failed to import FalconGPTModel. Please ensure the necessary dependencies are installed.") -@dataclass + +@dataclass class FalconTransformerConfig(TransformerConfig): """ Transformer Config for Falcon Variants """ - + new_decoder_architecture: bool = False parallel_attention: bool = False + class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export @@ -239,7 +243,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) self.parallel_attention = cfg.get('parallel_attention', False) - + self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: self.prev_consumed_samples = 0 @@ -334,7 +338,7 @@ def model_provider_func(self, pre_process, post_process): transformer_layer_spec = falcon_layer_spec model = FalconGPTModel( config=self.transformer_config, - transformer_layer_spec = transformer_layer_spec, + transformer_layer_spec=transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -345,13 +349,12 @@ def model_provider_func(self, pre_process, post_process): rotary_percent=self.cfg.get('rotary_percentage', 1.0), seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), ) - - + elif self.mcore_gpt: transformer_layer_spec = gpt_layer_with_transformer_engine_spec model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec = transformer_layer_spec, + transformer_layer_spec=transformer_layer_spec, vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1562,7 +1565,7 @@ def build_transformer_config(self) -> TransformerConfig: mcore_gpt = self.cfg.get('mcore_gpt', False) new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) parallel_attention = self.cfg.get('parallel_attention', False) - + normalization = self.cfg.get('normalization', 'layernorm') if normalization == 'layernorm': normalization = 'LayerNorm' @@ -1644,12 +1647,12 @@ def build_transformer_config(self) -> TransformerConfig: f"The model: {self} does not have field.name: {field.name} in its cfg. 
" f"Add this key to cfg or config_mapping to make to make it configurable." ) - + if mcore_gpt and (new_decoder_architecture or parallel_attention): transformer_config = FalconTransformerConfig( **transformer_config_dict, - new_decoder_architecture = new_decoder_architecture, - parallel_attention = parallel_attention, + new_decoder_architecture=new_decoder_architecture, + parallel_attention=parallel_attention, ) else: transformer_config = TransformerConfig(**transformer_config_dict) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index a47ec258662e..3438bbddafaf 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -32,19 +32,20 @@ import argparse import os -from typing import Dict import time +from typing import Dict import pytorch_lightning as pl import torch import yaml from omegaconf import OmegaConf -from transformers import FalconConfig, AutoModelForCausalLM +from transformers import AutoModelForCausalLM, FalconConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.utils import logging + def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False): def get_new_key(old_key): if old_key == "transformer.word_embeddings.weight": @@ -53,10 +54,10 @@ def get_new_key(old_key): return old_key.replace("transformer.ln_f", "decoder.final_layernorm") elif old_key.startswith("lm_head"): return old_key.replace("lm_head", "output_layer") - + # For the rest, a base transformation key = old_key.replace("transformer.h", "decoder.layers") - + # Handling the layer normalization replacements if falcon_config.new_decoder_architecture: key = key.replace("ln_attn", "input_layernorm") @@ -65,7 +66,7 @@ def get_new_key(old_key): key = key.replace("input_layernorm", "input_layernorm") if not falcon_config.parallel_attn: key = key.replace("post_attention_layernorm", "post_self_attn_layernorm") - + key = key.replace("self_attention.dense", "self_attention.linear_proj") key = key.replace("self_attention.query_key_value", "self_attention.linear_qkv") key = key.replace("dense_h_to_4h", "linear_fc1") @@ -83,6 +84,7 @@ def get_new_key(old_key): return new_dict + def load_falcon_config(args) -> FalconConfig: """ Helper utility to load FalconConfig. 
@@ -114,39 +116,37 @@ def load_falcon_config(args) -> FalconConfig: config.model_type = 'falcon' return config + if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file") parser.add_argument( - "--config", type=str, required=True, help="Path to the megatron_gpt_config.yaml file" - ) - parser.add_argument( - "--input", type=str, required=True, help="Falcon variants from HuggingFace hub or local dir with downloaded model" - ) - parser.add_argument( - "--output", type=str, default=".", help="Path to dir where to store output .nemo file" + "--input", + type=str, + required=True, + help="Falcon variants from HuggingFace hub or local dir with downloaded model", ) + parser.add_argument("--output", type=str, default=".", help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) - parser.add_argument( - "--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving" - ) - + parser.add_argument("--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving") + args = parser.parse_args() if not os.path.isdir(args.output): raise FileNotFoundError(f"Output directory '{args.output}' does not exist") - + falcon_config = load_falcon_config(args) logging.info(f"falcon_config, {falcon_config}") with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) - + model_dict = orig_cfg["model"] - + if "data" in model_dict: del model_dict["data"] - + override_model_dict = { "micro_batch_size": 1, "global_batch_size": 1, @@ -226,26 +226,25 @@ def load_falcon_config(args) -> FalconConfig: override_model_dict["num_query_groups"] = falcon_config.num_kv_heads elif falcon_config.multi_query: override_model_dict["num_query_groups"] = 1 - + # Additional logic for bias fusion if falcon_config.bias: override_model_dict["bias_activation_fusion"] = True override_model_dict["bias_dropout_add_fusion"] = True - + # Addtional logic for rope scaling if falcon_config.rope_scaling is not None: if falcon_config.rope_scaling.type == 'linear': override_model_dict['seq_len_interpolation_factor'] = falcon_config.rope_scaling.factor else: raise ValueError("Only linear rope scaling type is supported now") - - + model_dict.update(override_model_dict) model_dict["tokenizer"] = tokenizer_dict model_dict["name"] = 'megatron_falcon_gpt' omega_cfg = OmegaConf.create(model_dict) - + # output_path = "./falcon_megatron_config.yaml" # OmegaConf.save(config=omega_cfg, f=output_path) @@ -287,4 +286,4 @@ def load_falcon_config(args) -> FalconConfig: logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logging.info(f'nemo model created and saved. Total time: {t}') \ No newline at end of file + logging.info(f'nemo model created and saved. 
Total time: {t}') From b9264d46a8ab62e11e5abe2cd8c9a336a78fff1f Mon Sep 17 00:00:00 2001 From: Vivian Date: Wed, 11 Oct 2023 01:17:02 +0000 Subject: [PATCH 24/69] add proper header Signed-off-by: Vivian --- .../megatron/falcon/__init__.py | 14 ++++++++++++++ .../megatron/falcon/falcon_decoder_block.py | 14 +++++++++++++- .../megatron/falcon/falcon_decoder_layer.py | 14 +++++++++++++- .../megatron/falcon/falcon_gpt_model.py | 16 ++++++++++++++-- .../megatron/falcon/falcon_spec.py | 14 ++++++++++++++ 5 files changed, 68 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index 5dd085d829f6..d6a3184288ce 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py index 16bda328f38b..b2ee4882ed46 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py @@ -1,4 +1,16 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import re from contextlib import nullcontext diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 9ff495c87e7e..23c74cfaa083 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -1,4 +1,16 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from dataclasses import dataclass from typing import Union diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py index 05d449544094..33369c3c3d97 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py @@ -1,5 +1,17 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# just copy paste here, need work +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging from typing import Literal, Optional diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index be62fcb33bea..4906442c5426 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
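# [Editor's note] Illustrative sketch, not part of the patch: the mcore_mixins.py change a
# little further below in this series handles linear_qkv being either a fused LayerNorm+Linear
# (which returns a (qkv, layernorm_output) tuple) or a plain linear (which returns only qkv).
# The helper below is hypothetical and simply restates that branching: the LoRA adapter
# consumes the layernorm output when it is available and falls back to the raw hidden_states
# otherwise.
def example_split_linear_qkv_output(linear_qkv_output, hidden_states):
    if isinstance(linear_qkv_output, tuple):
        mixed_qkv, layernorm_output = linear_qkv_output
    else:
        mixed_qkv, layernorm_output = linear_qkv_output, None
    lora_input = layernorm_output if layernorm_output is not None else hidden_states
    return mixed_qkv, lora_input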
+ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear From 3dcbd384512a8c3fdafa19d686a473f84bb3ce47 Mon Sep 17 00:00:00 2001 From: Huiying Li Date: Thu, 12 Oct 2023 01:05:51 -0700 Subject: [PATCH 25/69] falcon lora mixin to support when non-fused LN linear --- .../modules/common/megatron/adapters/mcore_mixins.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index a56318294e38..990bbcf5e94d 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -57,13 +57,21 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): Derives `query`, `key` and `value` tensors from `hidden_states`. """ # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] - (mixed_qkv, layernorm_output), _ = self.linear_qkv(hidden_states) + linear_qkv_output, _ = self.linear_qkv(hidden_states) + layernorm_output = None + if isinstance(linear_qkv_output, tuple): #if LN and linear fused, both will be returned + mixed_qkv, layernorm_output = linear_qkv_output + else: # otherwise only mixed_qkv + mixed_qkv = linear_qkv_output # LoRA logic if self.is_adapter_available(): lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) if lora_kqv_adapter: - lora_mixed_qkv = lora_kqv_adapter(layernorm_output) + if layernorm_output: + lora_mixed_qkv = lora_kqv_adapter(layernorm_output) + else: + lora_mixed_qkv = lora_kqv_adapter(hidden_states) mixed_qkv = mixed_qkv + lora_mixed_qkv # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] From a9df5a4ded30ef458d180add1937c9dc41db12d2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Oct 2023 02:47:53 +0000 Subject: [PATCH 26/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/modules/common/megatron/adapters/mcore_mixins.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 990bbcf5e94d..d23146375802 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -59,9 +59,9 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] linear_qkv_output, _ = self.linear_qkv(hidden_states) layernorm_output = None - if isinstance(linear_qkv_output, tuple): #if LN and linear fused, both will be returned - mixed_qkv, layernorm_output = linear_qkv_output - else: # otherwise only mixed_qkv + if isinstance(linear_qkv_output, tuple): # if LN and linear fused, both will be returned + mixed_qkv, layernorm_output = linear_qkv_output + else: # otherwise only mixed_qkv mixed_qkv = linear_qkv_output # LoRA logic From d7129255503a226fa158ab8a1f17caac0fc2fe80 Mon Sep 17 00:00:00 2001 From: Vivian Date: Tue, 17 Oct 2023 04:42:20 +0000 Subject: [PATCH 27/69] revise jenkinsfile, tokenizer update in convertion script, add two falcon config files 
Signed-off-by: Vivian --- Jenkinsfile | 13 +- .../conf/megatron_falcon_config.yaml | 217 ++++++++++++++++++ .../conf/megatron_falcon_inference.yaml | 39 ++++ .../convert_hf_falcon_to_nemo.py | 30 +-- 4 files changed, 283 insertions(+), 16 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_falcon_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml diff --git a/Jenkinsfile b/Jenkinsfile index 3d262931915b..50b2ebd225d5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -59,11 +59,11 @@ pipeline { stage('Megatron Core installation') { steps { - // pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/ab0336a5c8eab77aa74ae604ba1e73decbf6d560 + // pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/954a65b04c01a4986adbad2a7cc9e9a2d094dd77 // ToT for 23.08 branch sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \ + git checkout 954a65b04c01a4986adbad2a7cc9e9a2d094dd77 && \ pip install -e .' } } @@ -135,6 +135,15 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' } } + stage('Falcon') { + steps { + sh 'python scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py \ + --config examples/nlp/language_modeling/conf/megatron_gpt_config.yaml \ + --input /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output /home/TestData/nlp/megatron_gpt/falcon-ci-hf' + sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_falcon-ci-hf_bf16_tp1_pp1.nemo' + } + } } } diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml new file mode 100644 index 000000000000..b9a38aa5b952 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -0,0 +1,217 @@ +name: megatron_falcon_gpt +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_falcon_gpt + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 32 # 7b: 32 | 40b: 60 | 180b: 80 + hidden_size: 4544 # 7b: 4544 | 40b: 8192 | 180b: 14848 + ffn_hidden_size: 18176 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 18176 | 40b: 32768 | 180b: 59392 + num_attention_heads: 71 # 7b: 71 | 40b: 128 | 180b: 232 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
+  transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
+  openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
+  normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
+  position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope']
+  rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
+  attention_type: 'multihead' # Attention type. Options ['multihead']
+  share_embeddings_and_output_weights: False # Share embedding and output layer weights.
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 1 | 40b: 8 | 180b: 8
+  new_decoder_architecture: false
+  parallel_attention: true
+
+  tokenizer:
+    library: 'huggingface'
+    type: 'tiiuae/falcon-7b'
+    use_fast: True
+
+  # Mixed precision
+  native_amp_init_scale: 4294967296 # 2 ** 32
+  native_amp_growth_interval: 1000
+  hysteresis: 2 # Gradient scale hysteresis
+  fp32_residual_connection: False # Move residual connections to fp32
+  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
+
+  # Megatron O2-style half-precision
+  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  grad_allreduce_chunk_size_mb: 125
+
+  # Fusion
+  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.
+  gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2.
+  bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+  bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+  get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+
+
+  # Miscellaneous
+  seed: 1234
+  resume_from_checkpoint: null # manually set the checkpoint file to load from
+  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
+  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+  sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages
+
+  ## Activation Checkpointing
+  # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+ # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. 
Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + # data_prefix: ??? + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml new file mode 100644 index 000000000000..298b6a702571 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml @@ -0,0 +1,39 @@ +inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: False # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+  compute_logprob: False # whether to compute the log probability of all the input text; a special case of running inference, default False
+  end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated
+
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+  use_distributed_sampler: False
+
+tensor_model_parallel_size: -1
+pipeline_model_parallel_size: -1
+pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
+megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory
+gpt_model_file: null # GPT nemo file path
+checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training
+checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
+hparams_file: null # model configuration file, only used for PTL checkpoint loading
+prompts: # prompts for GPT inference
+  - "Q: How are you?"
+  - "Q: How big is the universe?"
+server: False # whether to launch the API server
+port: 5555 # the port number for the inference server
+web_server: False # whether to launch the web inference server
+share: False # whether to create a public URL
+username: test # user name for web client
+password: test2 # password for web client
+web_port: 9889 # the port number of the web server
diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
index 3438bbddafaf..20098ed5d554 100644
--- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py
@@ -15,10 +15,8 @@
 """
 Conversion script to convert Huggingface Falcon 1B/7B/40B/180B checkpoints into nemo checkpoint.
 
-This script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP
-values, then after running this script, please use the script located below to set the
-TP/PP values you want:
-    NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py
+This script will generate a Megatron model with TP=1 and PP=1. The new distributed checkpoint format does not
+require the user to run an additional script to change the TP/PP values manually.
 
 Example to run this conversion script:
 ```
@@ -88,7 +86,7 @@ def get_new_key(old_key):
 def load_falcon_config(args) -> FalconConfig:
     """ Helper utility to load FalconConfig.
 
-    Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and
+    Legacy Falcon-7B and Falcon-40B are not compatible with `transformers.FalconConfig` and
     `transformers.FalconModel`. need to manually set the config values
     and force to `falcon` model type.
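In practice, the manual handling that this docstring refers to amounts to renaming the legacy `RefinedWeb`/`RefinedWebModel` config fields into their `transformers.FalconConfig` equivalents. A minimal sketch of that mapping follows; it is an editorial illustration, not the script's actual implementation, and the legacy field names (`n_layer`, `n_head`, `n_head_kv`, etc.) are assumptions based on the published tiiuae config.json files:

```python
# Hedged illustration only -- not the conversion script's code. Legacy Falcon checkpoints
# (model_type "RefinedWeb" / "RefinedWebModel") expose n_layer/n_head style fields that must
# be renamed before transformers.FalconConfig will accept them.
from transformers import AutoConfig, FalconConfig


def build_falcon_config_from_legacy(checkpoint_dir: str) -> FalconConfig:
    legacy = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True)
    if legacy.model_type == "falcon":
        # Already in the native format supported by transformers.
        return FalconConfig.from_pretrained(checkpoint_dir)
    # Field names below come from the legacy config.json files and are assumptions here.
    return FalconConfig(
        vocab_size=legacy.vocab_size,
        hidden_size=legacy.hidden_size,
        num_hidden_layers=legacy.n_layer,
        num_attention_heads=legacy.n_head,
        num_kv_heads=getattr(legacy, "n_head_kv", None),
        new_decoder_architecture=legacy.model_type == "RefinedWeb",  # assumed 40B-style blocks
        multi_query=getattr(legacy, "multi_query", False),
        parallel_attn=legacy.parallel_attn,
        alibi=legacy.alibi,
        bias=legacy.bias,
        layer_norm_epsilon=legacy.layer_norm_epsilon,
        initializer_range=legacy.initializer_range,
    )
```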
""" @@ -138,7 +136,6 @@ def load_falcon_config(args) -> FalconConfig: raise FileNotFoundError(f"Output directory '{args.output}' does not exist") falcon_config = load_falcon_config(args) - logging.info(f"falcon_config, {falcon_config}") with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) @@ -245,19 +242,14 @@ def load_falcon_config(args) -> FalconConfig: omega_cfg = OmegaConf.create(model_dict) - # output_path = "./falcon_megatron_config.yaml" - # OmegaConf.save(config=omega_cfg, f=output_path) - trainer = pl.Trainer(**trainer_dict) logging.info("Creating Megatron model...") tik = time.time() model = MegatronGPTModel(omega_cfg, trainer) - logging.info(f"Created model:\n{model}") logging.info("Loading HuggingFace model...") model_hf = AutoModelForCausalLM.from_pretrained(args.input) - logging.info(f"Loaded model:\n{model_hf}") state_dict_hf = model_hf.state_dict() convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2) @@ -277,12 +269,22 @@ def load_falcon_config(args) -> FalconConfig: raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") logging.info("Saving model...") - + + # We make sure that the tokenizer can be instantiated later regardless of args.input + if falcon_config.new_decoder_architecture: + model.cfg.tokenizer.update(type="tiiuae/falcon-40b") + elif falcon_config.multi_query: + model.cfg.tokenizer.update(type="tiiuae/falcon-7b") + elif falcon_config.alibi and falcon_config.num_hidden_layers == 36: + model.cfg.tokenizer.update(type="tiiuae/falcon-rw-7b") + else: + model.cfg.tokenizer.update(type="tiiuae/falcon-rw-1b") + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) - name_last_part = os.path.basename(args.input.rstrip('/')) - model.save_to(os.path.join(args.output, f'falcon_{name_last_part}_{args.precision}_tp1_pp1.nemo')) + tokenizer_name_part = model.cfg.tokenizer["type"].split("/")[1] + model.save_to(os.path.join(args.output, f'falcon_{tokenizer_name_part}_{args.precision}_tp1_pp1.nemo')) logging.info("Done.") tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) From 7a46e868c6b1b59617142b6af90ca1a417c9ca6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Oct 2023 04:44:35 +0000 Subject: [PATCH 28/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 20098ed5d554..275955d5a056 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -269,7 +269,7 @@ def load_falcon_config(args) -> FalconConfig: raise RuntimeError(f"Unexpected keys: \n{unexpected_keys}") logging.info("Saving model...") - + # We make sure that the tokenizer can be instantiated later regardless of args.input if falcon_config.new_decoder_architecture: model.cfg.tokenizer.update(type="tiiuae/falcon-40b") @@ -279,7 +279,7 @@ def load_falcon_config(args) -> FalconConfig: model.cfg.tokenizer.update(type="tiiuae/falcon-rw-7b") else: model.cfg.tokenizer.update(type="tiiuae/falcon-rw-1b") - + dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = 
model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) From d4a5fec8ebc435160b52f2bd9c60520e07f94c15 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 2 Nov 2023 22:49:30 -0700 Subject: [PATCH 29/69] refactor falcon to use MCoreGPT+spec+baselayer initial commit --- .../language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon/__init__.py | 2 - .../megatron/falcon/falcon_decoder_block.py | 296 ---------------- .../megatron/falcon/falcon_decoder_layer.py | 42 +-- .../megatron/falcon/falcon_gpt_model.py | 316 ------------------ .../megatron/falcon/falcon_spec.py | 41 +-- .../language_modeling/megatron_gpt_model.py | 82 ++--- .../convert_hf_falcon_to_nemo.py | 9 +- 8 files changed, 61 insertions(+), 729 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index bdd9da8799e6..70b6d4c169b8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel + from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py index d6a3184288ce..4fc50543f1d2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .falcon_gpt_model import FalconGPTModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py deleted file mode 100644 index b2ee4882ed46..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_block.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re -from contextlib import nullcontext - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor - -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules - - -class FalconTransformerBlock(MegatronModule): - """Transformer class.""" - - def __init__( - self, - config: TransformerConfig, - transformer_layer_spec: ModuleSpec, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - ): - super().__init__(config=config) - - self.config: TransformerConfig = config - self.transformer_layer_spec: ModuleSpec = transformer_layer_spec - - self.self_attn_mask_type = self_attn_mask_type - self.post_layer_norm = post_layer_norm - self.pre_process = pre_process - self.post_process = post_process - - # required for pipeline parallel schedules - self.input_tensor = None - - self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' - - self.num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - self._build_layers(self.transformer_layer_spec) - - def _build_layers(self, transformer_layer_spec): - # Transformer layers. - # @jcasper can we improve how we deal with layer_number? - # currently it's only used in CoreAttention? - # if self.apply_query_key_layer_scaling: - # coeff = self.layer_number - # self.norm_factor *= coeff - def build_layer(layer_number): - layer = FalconTransformerLayer( - config=self.config, - submodules=transformer_layer_spec.submodules, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, - ) - return layer - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - # Interleaved pipeline parallelism: - # Number of layers in each model chunk is the number of layers in the stage, - # divided by the number of model chunks in a stage. - # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0] [2] [4] [6] - # Stage 1: [1] [3] [5] [7] - # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of - # layers to stages like (each list is a model chunk): - # Stage 0: [0, 1] [4, 5] - # Stage 1: [2, 3] [6, 7] - - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size - - num_layers_to_build = num_layers_per_virtual_rank - - else: - # Non-interleaved pipeline parallelism: - # Each stage gets a contiguous set of layers. 
- - num_layers_to_build = self.num_layers_per_pipeline_rank - - # offset is implicit in TransformerLayer - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) - - # # TODO: add back standalone_embedding_stage - # if self.num_layers == 0: - # # When a standalone embedding stage is used (e.g., - # # args.standalone_embedding_stage == True), virtual pipeline ranks - # # on pipeline rank 0 will have zero transformer layers assigned to - # # them. This results in the model's input and output tensors to be - # # the same, which will cause failure for certain output tensor - # # optimizations (e.g., pipeline output deallocation). To remedy - # # this, we assign a 'no-op' layer on these ranks, which will - # # disconnect the input tensor from the output tensor. - # self.num_layers = 1 - # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) - # else: - # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) - - if self.post_process and self.post_layer_norm: - # Final layer norm before output. - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, - ) - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*args, **kwargs): - x_, *args = args - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, *args, **kwargs) - return x_ - - return custom_forward - - if self.config.recompute_method == 'uniform': - # Uniformly divide the total number of Transformer layers and checkpoint - # the input activation of each divided chunk. - # A method to further reduce memory usage reducing checkpoints. - l = 0 - while l < self.num_layers_per_pipeline_rank: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + self.config.recompute_num_layers), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - - l += self.config.recompute_num_layers - - elif self.config.recompute_method == 'block': - # Checkpoint the input activation of only a set number of individual - # Transformer layers and skip the rest. - # A method fully use the device memory removing redundant re-computation. - for l in range(self.num_layers_per_pipeline_rank): - if l < self.config.recompute_num_layers: - hidden_states = tensor_parallel.checkpoint( - custom(l, l + 1), - self.config.distribute_saved_activations, - hidden_states, - attention_mask, - rotary_pos_emb, - ) - else: - hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) - else: - raise ValueError("Invalid activation recompute method.") - - return hidden_states - - def set_input_tensor(self, input_tensor): - """Set input tensor to be used instead of forward()'s input. - - When doing pipeline parallelism the input from the previous - stage comes from communication, not from the input, so the - model's forward_step_func won't have it. 
This function is thus - used by internal code to bypass the input provided by the - forward_step_func""" - self.input_tensor = input_tensor - - def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): - # hidden_states (float): [s, b, h] - # attention_mask (bool): [1, 1, s, s] - - if not self.pre_process: - # See set_input_tensor() - hidden_states = self.input_tensor - - # Viewless tensor. - # - We only need to create a viewless tensor in the case of micro batch - # size (mbs) == 1, since in this case, 'hidden_states.transpose()' - # above creates a view tensor, and '.contiguous()' is a pass-through. - # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating - # the need to make it viewless. - # - # However, we don't explicitly check mbs == 1 here because - # make_viewless_tensor() has negligible overhead when its input - # is already viewless. - # - # - For the 'else' case above, calling make_viewless_tensor() here is - # likely redundant, since p2p_communication.py (likely originator) - # already creates viewless tensors. That said, make_viewless_tensor() - # is called here to be future-proof and corner-case-proof. - hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,) - - if self.config.sequence_parallel: - rng_context = tensor_parallel.get_cuda_rng_tracker().fork() - else: - rng_context = nullcontext() - - if self.config.fp8: - import transformer_engine # To keep out TE dependency when not training in fp8 - - if self.config.fp8 == "e4m3": - fp8_format = transformer_engine.common.recipe.Format.E4M3 - elif self.config.fp8 == "hybrid": - fp8_format = transformer_engine.common.recipe.Format.HYBRID - else: - raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") - - fp8_recipe = transformer_engine.common.recipe.DelayedScaling( - margin=self.config.fp8_margin, - interval=self.config.fp8_interval, - fp8_format=fp8_format, - amax_compute_algo=self.config.fp8_amax_compute_algo, - amax_history_len=self.config.fp8_amax_history_len, - override_linear_precision=(False, False, not self.config.fp8_wgrad), - ) - fp8_group = None - if parallel_state.model_parallel_is_initialized(): - fp8_group = parallel_state.get_amax_reduction_group() - fp8_context = transformer_engine.pytorch.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group - ) - else: - fp8_context = nullcontext() - - with rng_context and fp8_context: - # Forward pass. - if self.config.recompute_granularity == 'full': - hidden_states = self._checkpointed_forward( - hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, - ) - else: - for idx, layer in enumerate(self.layers): - hidden_states = layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - inference_params=inference_params, - ) - - # Final layer norm. - if self.post_process and self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - def sharded_state_dict(self, prefix=''): - - sharded_state_dict = {} - - layer_prefix = f'{prefix}layers.' 
- for layer in self.layers: - sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) - - if self.post_process and self.post_layer_norm: - state_dict = self.state_dict(keep_vars=True) - - tensor = state_dict['final_layernorm.weight'] - layer_name = f'{prefix}final_layernorm.weight' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - # RMSNorm doesn't have bias. - if 'final_layernorm.bias' in state_dict.keys(): - tensor = state_dict['final_layernorm.bias'] - layer_name = f'{prefix}final_layernorm.bias' - sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 23c74cfaa083..037305dd6824 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -23,7 +23,7 @@ from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp -from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.base_layer import LayerSubmodules, BaseLayer from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor @@ -45,7 +45,7 @@ @dataclass -class FalconTransformerLayerSubmodules: +class FalconTransformerLayerSubmodules(LayerSubmodules): input_layernorm: Union[ModuleSpec, type] = IdentityOp self_attention: Union[ModuleSpec, type] = IdentityOp self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp @@ -57,7 +57,7 @@ class FalconTransformerLayerSubmodules: mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp -class FalconTransformerLayer(MegatronModule): +class FalconTransformerLayer(BaseLayer): """A single transformer layer. Transformer layer takes input with size [s, b, h] and returns an @@ -72,7 +72,7 @@ def __init__( layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): - super().__init__(config=config) + super().__init__(config=config, submodules=submodules) self.config: TransformerConfig = config self.layer_number = layer_number + self._get_layer_offset() @@ -153,40 +153,14 @@ def __init__( # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad - def _get_layer_offset(self): - - pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() - - num_layers_per_pipeline_rank = ( - self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() - ) - - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() - vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - total_num_layers = self.config.num_layers - num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size - total_virtual_chunks = total_num_layers // vp_size - offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) - - else: - # Each stage gets a contiguous set of layers. 
- if parallel_state.get_pipeline_model_parallel_world_size() > 1: - offset = pipeline_rank * num_layers_per_pipeline_rank - else: - offset = 0 - - return offset - def forward( self, hidden_states, attention_mask, - encoder_output=None, - enc_dec_attn_mask=None, - inference_params=None, + context=None, + context_mask=None, rotary_pos_emb=None, + inference_params=None, ): # hidden_states: [s, b, h] @@ -250,7 +224,7 @@ def forward( # 'view' tensor. output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) - return output + return output, context def sharded_state_dict(self, prefix=''): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py deleted file mode 100644 index 33369c3c3d97..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_gpt_model.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Literal, Optional - -import torch -from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding -from megatron.core.models.gpt.gpt_embedding import GPTEmbedding -from megatron.core.transformer.enums import AttnMaskType, ModelType -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint -from torch import Tensor - -# from megatron.core.transformer.transformer_block import TransformerBlock -from .falcon_decoder_block import FalconTransformerBlock - - -class FalconGPTModel(MegatronModule): - """Transformer language model. - - Arguments: - config (TransformerConfig): transformer config - - vocab_size (int): vocabulary size - - max_sequence_length (int): maximum size of sequence. This is used for positional embedding - - pre_process (bool): Include embedding layer (used with pipeline parallelism) - post_process (bool): Include an output layer (used with pipeline parallelism) - - parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks - - share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are - shared. Defaults to False. - - position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. - Defaults is 'learned_absolute'. - - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. - - seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. - The value must be a float larger than 1.0. Defaults to None. 
- """ - - def __init__( - self, - config: TransformerConfig, - transformer_layer_spec: ModuleSpec, - vocab_size: int, - max_sequence_length: int, - pre_process: bool = True, - post_process: bool = True, - fp16_lm_cross_entropy: bool = False, - parallel_output: bool = True, - share_embeddings_and_output_weights: bool = False, - position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', - rotary_percent: float = 1.0, - seq_len_interpolation_factor: Optional[float] = None, - ): - super(FalconGPTModel, self).__init__(config=config) - - self.config: TransformerConfig = config - self.transformer_layer_spec: ModuleSpec = transformer_layer_spec - self.vocab_size = vocab_size - self.max_sequence_length = max_sequence_length - self.pre_process = pre_process - self.post_process = post_process - self.fp16_lm_cross_entropy = fp16_lm_cross_entropy - self.parallel_output = parallel_output - self.share_embeddings_and_output_weights = share_embeddings_and_output_weights - self.position_embedding_type = position_embedding_type - - # megatron core pipelining currently depends on model type - # TODO: remove this dependency ? - self.model_type = ModelType.encoder_or_decoder - - # Embeddings. - if self.pre_process: - self.embedding = GPTEmbedding( - config=self.config, - vocab_size=self.vocab_size, - max_sequence_length=self.max_sequence_length, - add_position_embedding=(self.position_embedding_type == 'learned_absolute'), - ) - - # Rotary Position Embeddings - if self.position_embedding_type == 'rope': - rotary_dim = self.config.kv_channels - if rotary_percent < 1.0: - rotary_dim = int(rotary_dim * rotary_percent) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) - else: - self.rotary_pos_emb = None - - # Transformer. - self.decoder = FalconTransformerBlock( - config=self.config, - transformer_layer_spec=self.transformer_layer_spec, - self_attn_mask_type=AttnMaskType.causal, - pre_process=self.pre_process, - post_process=self.post_process, - ) - - # Output - if post_process: - self.output_layer = tensor_parallel.ColumnParallelLinear( - config.hidden_size, - self.vocab_size, - config=config, - init_method=config.init_method, - bias=False, - skip_bias_add=False, - gather_output=not self.parallel_output, - skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, - ) - - if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): - self.initialize_last_stage_with_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" - - # This is usually handled in schedules.py but some inference code still - # gives us non-lists or None - if not isinstance(input_tensor, list): - input_tensor = [input_tensor] - - assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' - self.decoder.set_input_tensor(input_tensor[0]) - - def forward( - self, - input_ids: Tensor, - position_ids: Tensor, - attention_mask: Tensor, - decoder_input: Tensor = None, - labels: Tensor = None, - inference_params=None, - ): - # If decoder_input is provided (not None), then input_ids and position_ids are ignored. - # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. - - # Decoder embedding. 
- if decoder_input is not None: - pass - elif self.pre_process: - decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - else: - # intermediate stage of pipeline - # decoder will get hidden_states from encoder.input_tensor - decoder_input = None - - # Rotary positional embeddings - rotary_pos_emb = None - if self.rotary_pos_emb is not None: - if inference_params is not None: - rotary_seq_len = inference_params.max_sequence_length - else: - if self.decoder.input_tensor is not None: - rotary_seq_len = self.decoder.input_tensor.size(0) - else: - rotary_seq_len = decoder_input.size(0) - - # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region - if self.config.sequence_parallel: - rotary_seq_len *= self.config.tensor_model_parallel_size - - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) - - # Run decoder. - hidden_states = self.decoder( - hidden_states=decoder_input, - attention_mask=attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, - ) - - if not self.post_process: - return hidden_states - - # logits and loss - output_weight = None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) - - if labels is None: - # [s b h] => [b s h] - return logits.transpose(0, 1).contiguous() - - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) - - # [s b] => [b, s] - loss = loss.transpose(0, 1).contiguous() - return loss - - def shared_embedding_or_output_weight(self): - if self.pre_process: - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - - def initialize_last_stage_with_word_embeddings(self): - - # This function just initializes the word embeddings in the final stage - # when we are using pipeline parallelism and sharing word - # embeddings. Nothing to do if we aren't sharing weights or aren't using - # pipeline parallelism. - if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): - return - - if self.post_process and not self.pre_process: - assert not parallel_state.is_pipeline_first_stage() - # set word_embeddings weights to 0 here, then copy first - # stage's weights using all_reduce below. - self.output_layer.weight.data.fill_(0) - self.output_layer.weight.shared = True - - # Parameters are shared between the word embeddings layers, and the - # heads at the end of the model. In a pipelined setup with more than - # one stage, the initial embedding layer and the head are on different - # workers, so we do the following: - # 1. Create a second copy of word_embeddings on the last stage, with - # initial parameters of 0.0. - # 2. Do an all-reduce between the first and last stage to ensure that - # the two copies of word_embeddings start off with the same - # parameter values. - # 3. In the training loop, before an all-reduce between the grads of - # the two word_embeddings layers to ensure that every applied weight - # update is the same on both stages. - - # Ensure that first and last stages have the same initial parameter - # values. 
- if torch.distributed.is_initialized(): - if parallel_state.is_rank_in_embedding_group(): - weight = self.shared_embedding_or_output_weight() - torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group()) - - elif not getattr(FalconGPTModel, "embedding_warning_printed", False): - logging.getLogger(__name__).warning( - "Distributed processes aren't initialized, so the output layer " - "is not initialized with weights from the word embeddings. " - "If you are just manipulating a model this is fine, but " - "this needs to be handled manually. If you are training " - "something is definitely wrong." - ) - FalconGPTModel.embedding_warning_printed = True - - def sharded_state_dict(self, prefix=''): - sharded_state_dict = {} - - if self.pre_process: - embedding_prefix = f'{prefix}embedding.' - embedding_sharded_state_dict = self.embedding.sharded_state_dict(prefix=embedding_prefix) - sharded_state_dict.update(embedding_sharded_state_dict) - - decoder_prefix = f'{prefix}decoder.' - decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) - sharded_state_dict.update(decoder_sharded_state_dict) - - if self.post_process: - output_layer_prefix = f'{prefix}output_layer.' - output_layer_key = f'{output_layer_prefix}weight' - if self.share_embeddings_and_output_weights: - if not self.pre_process: - # when sharing embeddings with last stage, we need to use the weights from the first stage - # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight - tensor = self.shared_embedding_or_output_weight() - first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' - dp_rank = parallel_state.get_data_parallel_rank() - dp_size = parallel_state.get_data_parallel_world_size() - last_stage_word_emb_replica_id = dp_rank + dp_size # copy of first stage embedding - - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=tensor, - key=first_stage_word_emb_key, - replica_id=last_stage_word_emb_replica_id, - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - else: - output_layer_state_dict = self.output_layer.state_dict(prefix=output_layer_prefix, keep_vars=True) - output_layer_tensor = output_layer_state_dict[output_layer_key] - # independent output layer - sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( - tensor=output_layer_tensor, - key=output_layer_key, - replica_id=parallel_state.get_data_parallel_rank(), - allow_shape_mismatch=True, - ) - - sharded_state_dict[output_layer_key] = sharded_output_layer_tensor - - return sharded_state_dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 4906442c5426..debcc2555d46 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -31,25 +31,26 @@ from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules # Use this spec for an implementation using modules in TE -falcon_layer_spec = ModuleSpec( - module=FalconTransformerLayer, - submodules=FalconTransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - 
dot_product_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, +def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FalconTransformerLayer, + submodules=FalconTransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), ), + self_attn_bda=get_bias_dropout_add, + post_self_attn_layernorm=TENorm, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - post_self_attn_layernorm=TENorm, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ), -) + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d3297b2efd8a..2b3a41fe6a64 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -76,7 +76,6 @@ try: from megatron.core import InferenceParams, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel - from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.spec_utils import import_module @@ -104,28 +103,22 @@ HAVE_TE = False -def import_falcon_gpt_model(): - """Conditionally import FalconGPTModel. - """ - try: - # from megatron.core.models.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_gpt_model import FalconGPTModel - from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import falcon_layer_spec - - return FalconGPTModel, falcon_layer_spec - except (ImportError, ModuleNotFoundError): - raise ImportError("Failed to import FalconGPTModel. 
Please ensure the necessary dependencies are installed.") - - -@dataclass -class FalconTransformerConfig(TransformerConfig): - """ - Transformer Config for Falcon Variants - """ - - new_decoder_architecture: bool = False - parallel_attention: bool = False +def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec'): #Assumes the default spec function name + import importlib.util + name_spec_dict = { + "": "megatron.core.models.gpt.gpt_layer_specs", #default GPT + "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec" #Other customized model spec locations + } + module_path = name_spec_dict.get(spec_name) + if not module_path: + raise ImportError(f"Failed to import {spec_name}, please ensure {spec_name} is supported.") + module = importlib.import_module(module_path) + try: + spec = getattr(module, spec_func)() + except AttributeError: + raise ImportError(f"Module {module_path} does not have {spec_func}") + return spec class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ @@ -239,10 +232,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) self.mcore_gpt = cfg.get('mcore_gpt', False) - # Falcon specific args - self.falcon_name = cfg.get('name', 'megatron_falcon_gpt') - self.new_decoder_architecture = cfg.get('new_decoder_architecture', False) - self.parallel_attention = cfg.get('parallel_attention', False) + self.spec_name = cfg.get('name', '') self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) if self.rampup_batch_size: @@ -333,28 +323,10 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" - if self.mcore_gpt and self.falcon_name: - FalconGPTModel, falcon_layer_spec = import_falcon_gpt_model() - transformer_layer_spec = falcon_layer_spec - model = FalconGPTModel( - config=self.transformer_config, - transformer_layer_spec=transformer_layer_spec, - vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), - max_sequence_length=self.cfg.get('encoder_seq_length', 512), - pre_process=pre_process, - post_process=post_process, - parallel_output=True, - share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), - position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), - rotary_percent=self.cfg.get('rotary_percentage', 1.0), - seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), - ) - - elif self.mcore_gpt: - transformer_layer_spec = gpt_layer_with_transformer_engine_spec + if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec=transformer_layer_spec, + transformer_layer_spec=get_specs(self.spec_name), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, @@ -1563,10 +1535,6 @@ def build_transformer_config(self) -> TransformerConfig: gated_linear_unit = activation.endswith('glu') activation_func = activation_to_func(activation) - mcore_gpt = self.cfg.get('mcore_gpt', False) - new_decoder_architecture = self.cfg.get('new_decoder_architecture', False) - parallel_attention = self.cfg.get('parallel_attention', False) - normalization = self.cfg.get('normalization', 'layernorm') layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' if normalization == 'layernorm': @@ -1653,14 
+1621,12 @@ def build_transformer_config(self) -> TransformerConfig: f"Add this key to cfg or config_mapping to make to make it configurable." ) - if mcore_gpt and (new_decoder_architecture or parallel_attention): - transformer_config = FalconTransformerConfig( - **transformer_config_dict, - new_decoder_architecture=new_decoder_architecture, - parallel_attention=parallel_attention, - ) - else: - transformer_config = TransformerConfig(**transformer_config_dict) + transformer_config = TransformerConfig(**transformer_config_dict) + + #pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key,value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) return transformer_config diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 275955d5a056..fdce82d03023 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -175,13 +175,17 @@ def load_falcon_config(args) -> FalconConfig: "position_embedding_type": "rope", "precision": args.precision, "init_method_std": falcon_config.initializer_range, - "new_decoder_architecture": falcon_config.new_decoder_architecture, - "parallel_attention": falcon_config.parallel_attn, "activation": "gelu", "bias_activation_fusion": False, "bias_dropout_add_fusion": False, "seq_len_interpolation_factor": None, } + + mcore_customization_config_dict={ + "new_decoder_architecture": falcon_config.new_decoder_architecture, + "parallel_attention": falcon_config.parallel_attn, + } + tokenizer_dict = { "library": "huggingface", "type": args.input, @@ -239,6 +243,7 @@ def load_falcon_config(args) -> FalconConfig: model_dict.update(override_model_dict) model_dict["tokenizer"] = tokenizer_dict model_dict["name"] = 'megatron_falcon_gpt' + model_dict["mcore_customization_config"] = mcore_customization_config_dict omega_cfg = OmegaConf.create(model_dict) From f1860334c3f8e45967f7c41a1fc796134f5aedb3 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 2 Nov 2023 22:50:32 -0700 Subject: [PATCH 30/69] modification to get nemo run with mcore in this version --- nemo/collections/nlp/parts/nlp_overrides.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 8b2e06b4eb0c..1693cd23993a 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -195,7 +195,6 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, - use_fp8=app_state.use_fp8, ) # assert that fake tp and pp rank match after model parallel init From 1c846b8bd2ff77f98ae73fbdf57a371d2bf2993c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 19:48:47 +0000 Subject: [PATCH 31/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron/__init__.py | 2 +- .../megatron/falcon/falcon_decoder_layer.py | 3 +-- .../megatron/falcon/falcon_spec.py | 3 ++- 
.../models/language_modeling/megatron_gpt_model.py | 14 +++++++++----- .../convert_hf_falcon_to_nemo.py | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index 70b6d4c169b8..e9a6714729c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,8 +15,8 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec + from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 037305dd6824..c4998b667081 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -16,14 +16,13 @@ from typing import Union import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules +from megatron.core.transformer.base_layer import BaseLayer, LayerSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp -from megatron.core.transformer.base_layer import LayerSubmodules, BaseLayer from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index debcc2555d46..551f4e65bdfb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -49,7 +49,8 @@ def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + module=MLP, + submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8b3260786641..46e8f0a3c4e7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -103,11 +103,14 @@ HAVE_TE = False -def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec'): #Assumes the default spec function name +def get_specs( + spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec' +): # Assumes 
the default spec function name import importlib.util + name_spec_dict = { - "": "megatron.core.models.gpt.gpt_layer_specs", #default GPT - "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec" #Other customized model spec locations + "": "megatron.core.models.gpt.gpt_layer_specs", # default GPT + "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec", # Other customized model spec locations } module_path = name_spec_dict.get(spec_name) if not module_path: @@ -120,6 +123,7 @@ def get_specs(spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec') raise ImportError(f"Module {module_path} does not have {spec_func}") return spec + class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export @@ -1623,9 +1627,9 @@ def build_transformer_config(self) -> TransformerConfig: transformer_config = TransformerConfig(**transformer_config_dict) - #pass mcore customization configs directly to mcore + # pass mcore customization configs directly to mcore mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) - for key,value in mcore_customization_config_dict.items(): + for key, value in mcore_customization_config_dict.items(): setattr(transformer_config, key, value) return transformer_config diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index fdce82d03023..d93e925c9f71 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -181,7 +181,7 @@ def load_falcon_config(args) -> FalconConfig: "seq_len_interpolation_factor": None, } - mcore_customization_config_dict={ + mcore_customization_config_dict = { "new_decoder_architecture": falcon_config.new_decoder_architecture, "parallel_attention": falcon_config.parallel_attn, } From 42ff405844233718ba72f53b21195bd9c3383f69 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 16 Nov 2023 20:21:59 +0000 Subject: [PATCH 32/69] small fix on the output file path --- .../convert_hf_falcon_to_nemo.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index d93e925c9f71..9c0a2d994229 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -122,9 +122,9 @@ def load_falcon_config(args) -> FalconConfig: "--input", type=str, required=True, - help="Falcon variants from HuggingFace hub or local dir with downloaded model", + help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument("--output", type=str, default=".", help="Path to dir where to store output .nemo file") + parser.add_argument("--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) @@ -132,9 +132,6 @@ def load_falcon_config(args) -> FalconConfig: args = parser.parse_args() - if not os.path.isdir(args.output): - raise FileNotFoundError(f"Output directory '{args.output}' does not exist") - falcon_config = load_falcon_config(args) with open(args.config, "r", encoding="utf_8") as f: orig_cfg = yaml.safe_load(f) @@ -288,9 +285,8 @@ def 
load_falcon_config(args) -> FalconConfig: dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32 model = model.to(dtype=dtype) model.cfg.update(use_cpu_initialization=False) - tokenizer_name_part = model.cfg.tokenizer["type"].split("/")[1] - model.save_to(os.path.join(args.output, f'falcon_{tokenizer_name_part}_{args.precision}_tp1_pp1.nemo')) - logging.info("Done.") + model.save_to(args.output) + logging.info(f'Done. NeMo model saved to: {args.output}') tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) logging.info(f'nemo model created and saved. Total time: {t}') From 0fea44739e35c72827ed54669481cecb9ea399bc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 20:28:15 +0000 Subject: [PATCH 33/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 9c0a2d994229..39f194ecaf8d 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -124,7 +124,9 @@ def load_falcon_config(args) -> FalconConfig: required=True, help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument("--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file") + parser.add_argument( + "--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file" + ) parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) From 39b78a9ad4bd985065f705f0e1158dc2708378a7 Mon Sep 17 00:00:00 2001 From: Vivian Date: Thu, 16 Nov 2023 23:40:20 +0000 Subject: [PATCH 34/69] add nemo to hf conversion script --- .../convert_nemo_falcon_to_hf.py | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py new file mode 100644 index 000000000000..338f02b7be1b --- /dev/null +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -0,0 +1,171 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from pytorch_lightning import Trainer +from transformers import AutoModelForCausalLM + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +""" +Script to convert a falcon checkpoint in nemo (mcore path) into a HuggingFace checkpoint. +This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder. + +1) Generate only HF weights from a nemo file: + + python convert_nemo_falcon_to_hf.py \ + --in-file /path/to/file.nemo or /path/to/extracted_folder \ + --out-file /path/to/pytorch_model.bin + +2) Generate the full HF model folder + + python convert_nemo_falcon_to_hf.py \ + --in-file /path/to/file.nemo or /path/to/extracted_folder \ + --out-file /path/to/pytorch_model.bin \ + --hf-in-file /path/to/input_hf_folder \ + --hf-out-file /path/to/output_hf_folder + + Use the --cpu-only flag if the model cannot fit in the GPU (e.g. falcon 180b). + However this option makes the conversion script significantly slower. +""" + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--in-file", type=str, default=None, required=True, help="Path to .nemo file", + ) + parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to HF .bin file") + parser.add_argument( + "--hf-in-path", + type=str, + default=None, + help="A HF model path, " "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", + ) + parser.add_argument( + "--hf-out-path", + type=str, + default=None, + help="Output HF model path, " "with the same format as above but user's own weights", + ) + parser.add_argument( + "--precision", + type=str, + default=None, + help="Precision of output weights." + "Defaults to precision of the input nemo weights (model.cfg.trainer.precision)", + ) + parser.add_argument( + "--cpu-only", + action="store_true", + help="Load model in cpu only. Useful if the model cannot fit in GPU memory, " + "but this option makes the conversion script significantly slower.", + ) + args = parser.parse_args() + return args + + +def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None: + """ + Convert NeMo weights to HF weights + """ + dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy()) + if cpu_only: + map_location = torch.device('cpu') + model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) + model_config.use_cpu_initialization = True + else: + map_location, model_config = None, None + + if cpu_only: + logging.info("******** Loading model on CPU. 
This will take a significant amount of time.") + model = MegatronGPTModel.restore_from( + input_nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location + ) + if precision is None: + precision = model.cfg.precision + if precision in [32, "32"]: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + logging.warning(f"Precision string {precision} is not recognized, falling back to fp32") + dtype = torch.float32 # fallback + + param_to_weights = lambda param: param.to(dtype) + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + def get_original_key(new_key): + new_key = new_key[len(prefix):] + + if new_key.startswith("embedding.word_embeddings.weight"): + return "transformer.word_embeddings.weight" + elif new_key.startswith("decoder.final_layernorm"): + return new_key.replace("decoder.final_layernorm", "transformer.ln_f") + elif new_key.startswith("output_layer"): + return new_key.replace("output_layer", "lm_head") + + key = new_key.replace("decoder.layers", "transformer.h") + + if model.cfg.new_decoder_architecture: + key = key.replace("input_layernorm", "ln_attn") + key = key.replace("pre_mlp_layernorm", "ln_mlp") + else: + key = key.replace("input_layernorm", "input_layernorm") + if not model.cfg.parallel_attention: + key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") + + key = key.replace("self_attention.linear_proj", "self_attention.dense") + key = key.replace("self_attention.linear_qkv", "self_attention.query_key_value") + key = key.replace("linear_fc1", "dense_h_to_4h") + key = key.replace("linear_fc2", "dense_4h_to_h") + return key + + prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
+ + for key, value in model.state_dict().items(): + orig_key = get_original_key(key) + checkpoint['state_dict'][orig_key] = param_to_weights(value) + + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) + torch.save(checkpoint, output_hf_file) + logging.info(f"Weights reverted and saved to {output_hf_file}") + + +def replace_hf_weights(weights_file, input_hf_path, output_hf_path): + model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True) + nemo_exported = torch.load(weights_file) + + model.load_state_dict(nemo_exported['state_dict']) + model.save_pretrained(output_hf_path) + logging.info(f"Full HF model saved to {output_hf_path}") + + +if __name__ == '__main__': + args = get_args() + convert(args.in_file, args.out_file, precision=args.precision, cpu_only=args.cpu_only) + if args.hf_in_path and args.hf_out_path: + replace_hf_weights(args.out_file, args.hf_in_path, args.hf_out_path) + else: + logging.info("`hf-in-path` and/or `hf-out-path` not provided, not generating full HF model.") + logging.info(f".bin file is saved to {args.out_file}") From c85f3ac4d8427a515f1168aedb8624dca2b2709e Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:43:11 +0000 Subject: [PATCH 35/69] fix on base layer config and missing state dict due to dist ckpt --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 338f02b7be1b..923079f239f5 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.new_decoder_architecture: + if model.cfg.mcore_customization_config.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.parallel_attention: + if not model.cfg.mcore_customization_config.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,7 +144,10 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' for key, value in model.state_dict().items(): + if '_extra_state' in key: + continue orig_key = get_original_key(key) + print(f'Converting {key} to {orig_key}') checkpoint['state_dict'][orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) From b0c1bb73ee7b452055f840f18db167e2b4a346f3 Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:47:17 +0000 Subject: [PATCH 36/69] Revert "fix on base layer config and missing state dict due to dist ckpt" This reverts commit c85f3ac4d8427a515f1168aedb8624dca2b2709e. 
--- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 923079f239f5..338f02b7be1b 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.mcore_customization_config.new_decoder_architecture: + if model.cfg.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.mcore_customization_config.parallel_attention: + if not model.cfg.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,10 +144,7 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' for key, value in model.state_dict().items(): - if '_extra_state' in key: - continue orig_key = get_original_key(key) - print(f'Converting {key} to {orig_key}') checkpoint['state_dict'][orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) From ce1bf4a87d977d3c860a9cedf753ccc5ab029dbd Mon Sep 17 00:00:00 2001 From: Vivian Date: Fri, 17 Nov 2023 18:49:20 +0000 Subject: [PATCH 37/69] fix on base layer config and missing state dict due to dist ckpt --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 338f02b7be1b..3042baeda527 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -127,12 +127,12 @@ def get_original_key(new_key): key = new_key.replace("decoder.layers", "transformer.h") - if model.cfg.new_decoder_architecture: + if model.cfg.mcore_customization_config.new_decoder_architecture: key = key.replace("input_layernorm", "ln_attn") key = key.replace("pre_mlp_layernorm", "ln_mlp") else: key = key.replace("input_layernorm", "input_layernorm") - if not model.cfg.parallel_attention: + if not model.cfg.mcore_customization_config.parallel_attention: key = key.replace("post_self_attn_layernorm", "post_attention_layernorm") key = key.replace("self_attention.linear_proj", "self_attention.dense") @@ -144,6 +144,8 @@ def get_original_key(new_key): prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
for key, value in model.state_dict().items(): + if '_extra_state' in key: + continue orig_key = get_original_key(key) checkpoint['state_dict'][orig_key] = param_to_weights(value) From 6351ae9c51fae3f22b43d2ff17bb24efa4505fdc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:24:17 +0000 Subject: [PATCH 38/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../convert_nemo_falcon_to_hf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index 3042baeda527..ac678a8676bf 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -57,7 +57,8 @@ def get_args(): "--hf-in-path", type=str, default=None, - help="A HF model path, " "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", + help="A HF model path, " + "e.g. a folder containing https://huggingface.co/meta-falcon/falcon-2-7b-hf/tree/main", ) parser.add_argument( "--hf-out-path", @@ -116,7 +117,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> checkpoint['state_dict'] = OrderedDict() def get_original_key(new_key): - new_key = new_key[len(prefix):] + new_key = new_key[len(prefix) :] if new_key.startswith("embedding.word_embeddings.weight"): return "transformer.word_embeddings.weight" @@ -124,7 +125,7 @@ def get_original_key(new_key): return new_key.replace("decoder.final_layernorm", "transformer.ln_f") elif new_key.startswith("output_layer"): return new_key.replace("output_layer", "lm_head") - + key = new_key.replace("decoder.layers", "transformer.h") if model.cfg.mcore_customization_config.new_decoder_architecture: @@ -140,15 +141,15 @@ def get_original_key(new_key): key = key.replace("linear_fc1", "dense_h_to_4h") key = key.replace("linear_fc2", "dense_4h_to_h") return key - + prefix = 'model.module.' if any(k.startswith('model.module.') for k in model.state_dict()) else 'model.' 
- + for key, value in model.state_dict().items(): if '_extra_state' in key: continue orig_key = get_original_key(key) checkpoint['state_dict'][orig_key] = param_to_weights(value) - + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) torch.save(checkpoint, output_hf_file) logging.info(f"Weights reverted and saved to {output_hf_file}") From a6c1fae27e0af42341255ea50022a40b8313e0e2 Mon Sep 17 00:00:00 2001 From: Vivian chen Date: Wed, 29 Nov 2023 19:32:13 +0000 Subject: [PATCH 39/69] fix megatron_gpt_model Signed-off-by: Vivian chen --- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b110a306c56c..13472532b168 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -331,11 +331,7 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, -<<<<<<< HEAD - transformer_layer_spec=get_specs(self.spec_name), -======= transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), ->>>>>>> main vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, From 1156b6af234a48e366e13c0e8ed9a7c27e9ef678 Mon Sep 17 00:00:00 2001 From: Vivian chen Date: Wed, 29 Nov 2023 20:00:02 +0000 Subject: [PATCH 40/69] modify model config Signed-off-by: Vivian chen --- .../language_modeling/conf/megatron_falcon_config.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index b9a38aa5b952..143a75f3fc21 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -84,8 +84,6 @@ model: overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. 
| 7b: 1 | 40b: 8 | 180b: 8 - new_decoder_architecture: false - parallel_attention: true tokenizer: library: 'huggingface' @@ -215,3 +213,10 @@ model: warmup_steps: 500 constant_steps: 50000 min_lr: 2e-5 +gc_interval: 0 +precision: bf16 +mcore_customization_config: + new_decoder_architecture: false + parallel_attention: true +target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel +nemo_version: 1.21.0rc0 \ No newline at end of file From 5034dfc63b30e22dacf939fbac00afe1ad9d8dcf Mon Sep 17 00:00:00 2001 From: Vivian Chen <140748220+xuanzic@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:42:30 -0800 Subject: [PATCH 41/69] Apply suggestions from code review Co-authored-by: Eric Harper Signed-off-by: Vivian Chen <140748220+xuanzic@users.noreply.github.com> --- examples/nlp/language_modeling/conf/megatron_falcon_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index 143a75f3fc21..6d45db3f7d80 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -37,7 +37,7 @@ exp_manager: mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + filename: 'megatron_falcon--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} model: From c97d38c8b2dc05ef29adba00f14f885dd69f6936 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Fri, 1 Dec 2023 06:10:46 +0000 Subject: [PATCH 42/69] fix based on review Signed-off-by: Vivian Chen --- .../conf/megatron_falcon_config.yaml | 17 +++++------- .../conf/megatron_falcon_inference.yaml | 7 +++-- .../language_modeling/megatron/__init__.py | 1 - .../megatron/falcon/falcon_decoder_layer.py | 27 +++---------------- .../language_modeling/megatron_gpt_model.py | 2 +- 5 files changed, 15 insertions(+), 39 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml index 6d45db3f7d80..4b8009256a9e 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml @@ -84,6 +84,11 @@ model: overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: 1 # Number of query groups for group query attention. If None, normal attention is used. 
| 7b: 1 | 40b: 8 | 180b: 8 + gc_interval: 0 + precision: bf16 + mcore_customization_config: + new_decoder_architecture: false + parallel_attention: true tokenizer: library: 'huggingface' @@ -153,7 +158,6 @@ model: sequence_parallel: False ## Transformer Engine - transformer_engine: True fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID @@ -202,7 +206,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -212,11 +216,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 -gc_interval: 0 -precision: bf16 -mcore_customization_config: - new_decoder_architecture: false - parallel_attention: true -target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel -nemo_version: 1.21.0rc0 \ No newline at end of file + min_lr: 2e-5 \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml index 298b6a702571..1ccc9ed5dff8 100644 --- a/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_falcon_inference.yaml @@ -19,10 +19,9 @@ trainer: precision: bf16 # 16, 32, or bf16 use_distributed_sampler: False -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 +megatron_amp_O2: True # Enable O2-level automatic mixed precision to save memory gpt_model_file: null # GPT nemo file path checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading diff --git a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py index e9a6714729c9..3afb1e3fae48 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/__init__.py @@ -15,7 +15,6 @@ # from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel try: - from nemo.collections.nlp.models.language_modeling.megatron.falcon import falcon_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel HAVE_MEGATRON_CORE = True diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index c4998b667081..63b32bc70d3f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -89,7 +89,6 @@ def __init__( self.parallel_attention = None ## [Module 1: Input Layernorm] Optional Layernorm on the input data - # TODO: add pytorch only layernorm self.input_layernorm = build_module( submodules.input_layernorm, config=self.config, @@ -123,8 +122,8 @@ def __init__( ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture - self.pre_mlp_layernorm = ( - build_module( + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, config=self.config, hidden_size=self.config.hidden_size, @@ -134,9 +133,8 @@ def __init__( zero_centered_gamma=self.config.layernorm_zero_centered_gamma, normalization=self.config.normalization, ) - if self.new_decoder_architecture - else None - ) + else: + self.pre_mlp_layernorm = None ## [Module 6: MLP block] self.mlp = build_module(submodules.mlp, config=self.config) @@ -144,12 +142,6 @@ def __init__( ## [Module 7: BiasDropoutFusion] Optional self.mlp_bda = build_module(submodules.mlp_bda) - # @jcasper how should we handle nvfuser? - # Set bias+dropout+add fusion grad_enable execution handler. - # TORCH_MAJOR = int(torch.__version__.split('.')[0]) - # TORCH_MINOR = int(torch.__version__.split('.')[1]) - # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad self.bias_dropout_add_exec_handler = torch.enable_grad def forward( @@ -182,8 +174,6 @@ def forward( rotary_pos_emb=rotary_pos_emb, ) - # TODO: could we move `bias_dropout_add_exec_handler` itself - # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( attention_output_with_bias, residual, self.config.hidden_dropout @@ -208,26 +198,17 @@ def forward( mlp_output_without_bias = mlp_output + attn_output mlp_output_with_bias = (mlp_output_without_bias, None) - # TODO: could we move `bias_dropout_add_exec_handler` itself - # inside the module provided in the `bias_dropout_add_spec` module? 
with self.bias_dropout_add_exec_handler(): hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( mlp_output_with_bias, residual, self.config.hidden_dropout ) - # Jit compiled function creates 'view' tensor. This tensor - # potentially gets saved in the MPU checkpoint function context, - # which rejects view tensors. While making a viewless tensor here - # won't result in memory savings (like the data loader, or - # p2p_communication), it serves to document the origin of this - # 'view' tensor. output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) return output, context def sharded_state_dict(self, prefix=''): - # state_dict = self.state_dict(prefix=prefix, keep_vars=True) state_dict = self.state_dict(keep_vars=True) tensor_parallel_layers_axis_map = { diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 13472532b168..0cf374336d9a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -16,7 +16,7 @@ import os import queue import warnings -from dataclasses import dataclass, fields +from dataclasses import fields from functools import partial from typing import Any, Dict, Iterator, List, Optional, Union From 4383bd17386b789ed814d7655fcb9c8800bbf358 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Sat, 2 Dec 2023 01:09:59 +0000 Subject: [PATCH 43/69] multiple revise based on review and latest mcore changes --- .../megatron/falcon/falcon_decoder_layer.py | 12 ---------- .../language_modeling/megatron_gpt_model.py | 2 +- .../convert_hf_falcon_to_nemo.py | 9 ++++--- .../convert_nemo_falcon_to_hf.py | 24 +++++++++---------- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 63b32bc70d3f..fb0b0d9e0093 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -94,10 +94,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) ## [Module 2: SelfAttention] @@ -115,10 +111,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture @@ -128,10 +120,6 @@ def __init__( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, - persist_layer_norm=self.config.persist_layer_norm, - sequence_parallel=self.config.sequence_parallel, - zero_centered_gamma=self.config.layernorm_zero_centered_gamma, - normalization=self.config.normalization, ) else: self.pre_mlp_layernorm = None diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0cf374336d9a..46f70d248c84 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -331,7 +331,7 @@ def model_provider_func(self, pre_process, post_process): if self.mcore_gpt: model = MCoreGPTModel( config=self.transformer_config, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + transformer_layer_spec=get_specs(self.spec_name), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index 39f194ecaf8d..f02b6dc3b336 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -21,15 +21,14 @@ Example to run this conversion script: ``` python convert_hf_falcon_to_nemo.py \ - --config /path/to/megatron_gpt_config.yaml \ - --input \ - --output \ + --config /path/to/megatron_falcon_config.yaml \ + --input /path/to/hf/checkpoints/folder \ + --output /path/to/output/nemo/file \ --precision ``` """ import argparse -import os import time from typing import Dict @@ -125,7 +124,7 @@ def load_falcon_config(args) -> FalconConfig: help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) parser.add_argument( - "--output", type=str, default="None", required=True, help="Path to dir where to store output .nemo file" + "--output", type=str, required=True, help="Path to dir where to store output .nemo file" ) parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index ac678a8676bf..bce63409a519 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -21,6 +21,7 @@ from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.utils import logging @@ -50,9 +51,9 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--in-file", type=str, default=None, required=True, help="Path to .nemo file", + "--in-file", type=str, required=True, help="Path to .nemo file", ) - parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to HF .bin file") + parser.add_argument("--out-file", type=str, required=True, help="Path to HF .bin file") parser.add_argument( "--hf-in-path", type=str, @@ -92,6 +93,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> map_location = torch.device('cpu') model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.use_cpu_initialization = True + model_config.tensor_model_parallel_size = 1 else: map_location, model_config = None, None @@ -102,19 +104,15 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> ) if precision is 
None: precision = model.cfg.precision - if precision in [32, "32"]: - dtype = torch.float32 - elif precision in [16, "16", "16-mixed"]: - dtype = torch.float16 - elif precision in ["bf16", "bf16-mixed"]: - dtype = torch.bfloat16 - else: - logging.warning(f"Precision string {precision} is not recognized, falling back to fp32") + try: + dtype = torch_dtype_from_precision(precision) + except ValueError as e: + # warning that {precision} is not supported, fallback to float32 + logging.warning(str(e) + f", precision string '{precision}' is not recognized, falling back to fp32") dtype = torch.float32 # fallback param_to_weights = lambda param: param.to(dtype) checkpoint = OrderedDict() - checkpoint['state_dict'] = OrderedDict() def get_original_key(new_key): new_key = new_key[len(prefix) :] @@ -148,7 +146,7 @@ def get_original_key(new_key): if '_extra_state' in key: continue orig_key = get_original_key(key) - checkpoint['state_dict'][orig_key] = param_to_weights(value) + checkpoint[orig_key] = param_to_weights(value) os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) torch.save(checkpoint, output_hf_file) @@ -159,7 +157,7 @@ def replace_hf_weights(weights_file, input_hf_path, output_hf_path): model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True) nemo_exported = torch.load(weights_file) - model.load_state_dict(nemo_exported['state_dict']) + model.load_state_dict(nemo_exported) model.save_pretrained(output_hf_path) logging.info(f"Full HF model saved to {output_hf_path}") From b499c8805f18478d4d243f55b3e7aa94dfd12ca5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:11:20 +0000 Subject: [PATCH 44/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py | 4 +--- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py index f02b6dc3b336..ef9410b1b929 100644 --- a/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py @@ -123,9 +123,7 @@ def load_falcon_config(args) -> FalconConfig: required=True, help="Path to Falcon variants checkpoint from HuggingFace hub or local dir", ) - parser.add_argument( - "--output", type=str, required=True, help="Path to dir where to store output .nemo file" - ) + parser.add_argument("--output", type=str, required=True, help="Path to dir where to store output .nemo file") parser.add_argument( "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index bce63409a519..cbe70064e272 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -21,8 +21,8 @@ from transformers import AutoModelForCausalLM from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from 
nemo.utils import logging """ From 94f4ba25d664ba639151ceb86814dbd1ef6a9efb Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Sat, 2 Dec 2023 01:16:40 +0000 Subject: [PATCH 45/69] fix Signed-off-by: Vivian Chen --- scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py index cbe70064e272..66f6399855a3 100644 --- a/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_falcon_to_hf.py @@ -107,7 +107,6 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> try: dtype = torch_dtype_from_precision(precision) except ValueError as e: - # warning that {precision} is not supported, fallback to float32 logging.warning(str(e) + f", precision string '{precision}' is not recognized, falling back to fp32") dtype = torch.float32 # fallback From 8928050323acc091a5131e7d01582680ed4a2451 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sun, 3 Dec 2023 02:05:41 -0800 Subject: [PATCH 46/69] subclass from TransformerLayer --- .../megatron/falcon/falcon_decoder_layer.py | 63 ++----------------- .../megatron/falcon/falcon_spec.py | 6 +- 2 files changed, 7 insertions(+), 62 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index fb0b0d9e0093..1e3c83f6f394 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -20,9 +20,8 @@ from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules -from megatron.core.transformer.base_layer import BaseLayer, LayerSubmodules from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import make_viewless_tensor @@ -42,21 +41,7 @@ hyperparameters: transformer hyperparameters """ - -@dataclass -class FalconTransformerLayerSubmodules(LayerSubmodules): - input_layernorm: Union[ModuleSpec, type] = IdentityOp - self_attention: Union[ModuleSpec, type] = IdentityOp - self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp - - post_self_attn_layernorm: Union[ModuleSpec, type] = IdentityOp - - pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp - mlp: Union[ModuleSpec, type] = IdentityOp - mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp - - -class FalconTransformerLayer(BaseLayer): +class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
Transformer layer takes input with size [s, b, h] and returns an @@ -67,53 +52,21 @@ class FalconTransformerLayer(BaseLayer): def __init__( self, config: TransformerConfig, # should come from FalconTransformerConfig class - submodules: FalconTransformerLayerSubmodules, + submodules: TransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): - super().__init__(config=config, submodules=submodules) - self.config: TransformerConfig = config - - self.layer_number = layer_number + self._get_layer_offset() - - self.self_attn_mask_type = self_attn_mask_type + super().__init__(config=config, submodules=submodules, layer_number=layer_number) if hasattr(self.config, 'new_decoder_architecture'): self.new_decoder_architecture = self.config.new_decoder_architecture else: self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): self.parallel_attention = self.config.parallel_attention else: self.parallel_attention = None - ## [Module 1: Input Layernorm] Optional Layernorm on the input data - self.input_layernorm = build_module( - submodules.input_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - - ## [Module 2: SelfAttention] - self.self_attention = build_module(submodules.self_attention, config=self.config, layer_number=layer_number,) - - ## [Module 3: BiasDropoutFusion] Optional - self.self_attn_bda = build_module(submodules.self_attn_bda) - - ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - - ## [Module 5: pre mlp layernorm] Optional Layernorm before MLP, used in Falcon's new decoder architecture if self.new_decoder_architecture: self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, @@ -124,14 +77,6 @@ def __init__( else: self.pre_mlp_layernorm = None - ## [Module 6: MLP block] - self.mlp = build_module(submodules.mlp, config=self.config) - - ## [Module 7: BiasDropoutFusion] Optional - self.mlp_bda = build_module(submodules.mlp_bda) - - self.bias_dropout_add_exec_handler = torch.enable_grad - def forward( self, hidden_states, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 551f4e65bdfb..e53092e86a6b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -28,13 +28,14 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec -from .falcon_decoder_layer import FalconTransformerLayer, FalconTransformerLayerSubmodules +from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules +from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: return ModuleSpec( module=FalconTransformerLayer, - submodules=FalconTransformerLayerSubmodules( + submodules=TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( module=SelfAttention, @@ -46,7 +47,6 @@ def 
get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: ), ), self_attn_bda=get_bias_dropout_add, - post_self_attn_layernorm=TENorm, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( module=MLP, From b5234409b1dfa6c8357fb3eebe360b7236cbcbf1 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sun, 3 Dec 2023 03:11:25 -0800 Subject: [PATCH 47/69] fixes according to comments --- .../megatron/falcon/falcon_decoder_layer.py | 1 - .../megatron/falcon/falcon_spec.py | 2 +- .../language_modeling/megatron_gpt_model.py | 24 ++++++------------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 1e3c83f6f394..b20a493f5b08 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -94,7 +94,6 @@ def forward( if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) - # Optional Input Layer norm input_layernorm_output = self.input_layernorm(hidden_states) input_mlp_ln = input_layernorm_output diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index e53092e86a6b..22238a9152be 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -32,7 +32,7 @@ from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE -def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec: +def get_falcon_layer_spec() -> ModuleSpec: return ModuleSpec( module=FalconTransformerLayer, submodules=TransformerLayerSubmodules( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 46f70d248c84..a9cec5f9bfc6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -58,6 +58,7 @@ ) from nemo.collections.nlp.parts import utils_funcs from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -104,25 +105,14 @@ HAVE_TE = False -def get_specs( - spec_name, spec_func='get_gpt_layer_with_transformer_engine_spec' -): # Assumes the default spec function name - import importlib.util - +def get_specs(spec_name): name_spec_dict = { - "": "megatron.core.models.gpt.gpt_layer_specs", # default GPT - "megatron_falcon_gpt": "nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec", # Other customized model spec locations + "":get_gpt_layer_with_transformer_engine_spec(), + "megatron_falcon_gpt":get_falcon_layer_spec() } - module_path = name_spec_dict.get(spec_name) - if not module_path: - raise ImportError(f"Failed to import {spec_name}, please ensure {spec_name} is supported.") - - module = importlib.import_module(module_path) - try: - spec = getattr(module, spec_func)() - except AttributeError: - raise 
ImportError(f"Module {module_path} does not have {spec_func}") - return spec + if spec_name not in name_spec_dict: + raise ValueError(f"Spec name '{spec_name}' is not recognized.") + return name_spec_dict[spec_name] class MegatronGPTExportableModel(torch.nn.Module, Exportable): From 8ebf142a47348d81766272e26551960d4282cf71 Mon Sep 17 00:00:00 2001 From: Vivian Chen Date: Mon, 4 Dec 2023 17:45:52 +0000 Subject: [PATCH 48/69] add falcon ci test Signed-off-by: Vivian Chen --- Jenkinsfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 12fafac57a67..ad4a5955985f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -143,6 +143,15 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' } } + stage('Falcon') { + steps { + sh 'python scripts/nlp_language_modeling/convert_hf_falcon_to_nemo.py \ + --config examples/nlp/language_modeling/conf/megatron_falcon_config.yaml \ + --input /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' + } + } } } From 1c1bc512821b4fcf79808543bda14cc3c115a447 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Mon, 4 Dec 2023 10:23:40 -0800 Subject: [PATCH 49/69] add post_self_attn_layernorm --- .../megatron/falcon/falcon_decoder_layer.py | 9 +++++++++ .../language_modeling/megatron/falcon/falcon_spec.py | 11 +++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index b20a493f5b08..c2732423d73e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -67,6 +67,15 @@ def __init__( else: self.parallel_attention = None + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) if self.new_decoder_architecture: self.pre_mlp_layernorm = build_module( submodules.pre_mlp_layernorm, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 22238a9152be..51afb58b84c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -33,9 +33,7 @@ # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: - return ModuleSpec( - module=FalconTransformerLayer, - submodules=TransformerLayerSubmodules( + falcon_submodules = TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( module=SelfAttention, @@ -53,5 +51,10 @@ def get_falcon_layer_spec() -> ModuleSpec: submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, - ), + ) + #Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. 
+ falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec( + module=FalconTransformerLayer, + submodules=falcon_submodules ) From f84fee6bcf42f8ddd0c54c97138a9d13bb7fd0d1 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 7 Dec 2023 13:48:43 -0800 Subject: [PATCH 50/69] add explicit explanation/refs for handling lora logic --- .../common/megatron/adapters/mcore_mixins.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 588135df4f50..d36c9c7dbf09 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -21,6 +21,7 @@ from megatron.core.transformer.mlp import MLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor +from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear, TEColumnParallelLinear from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -65,19 +66,28 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] linear_qkv_output, _ = self.linear_qkv(hidden_states) layernorm_output = None - if isinstance(linear_qkv_output, tuple): # if LN and linear fused, both will be returned + + # In megatron/core/models/gpt/gpt_layer_specs.py TELayerNormColumnParallelLinear is used for linear_qkv. + # TELayerNormColumnParallelLinear fused LN and linear, both will be returned. + # In nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py TEColumnParallelLinear is used for linear_qkv, + # which only returns linear. + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): mixed_qkv, layernorm_output = linear_qkv_output - else: # otherwise only mixed_qkv + elif isinstance(self.linear_qkv, TEColumnParallelLinear): # only mixed_qkv mixed_qkv = linear_qkv_output + else: + raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. 
") # LoRA logic if self.is_adapter_available(): lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) if lora_kqv_adapter: - if layernorm_output: + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): lora_mixed_qkv = lora_kqv_adapter(layernorm_output) - else: + elif isinstance(self.linear_qkv, TEColumnParallelLinear): lora_mixed_qkv = lora_kqv_adapter(hidden_states) + else: + raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when applying lora.") mixed_qkv = mixed_qkv + lora_mixed_qkv # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] From ea39e6874bdfe86cae3040843ecf79fdd4d06bea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:10:33 +0000 Subject: [PATCH 51/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron/falcon/falcon_decoder_layer.py | 3 +- .../megatron/falcon/falcon_spec.py | 40 +++++++++---------- .../language_modeling/megatron_gpt_model.py | 7 +--- .../common/megatron/adapters/mcore_mixins.py | 11 +++-- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index c2732423d73e..5f3c46961c85 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -21,9 +21,9 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor """ We use the following notation throughout this file: @@ -41,6 +41,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 51afb58b84c9..ea265b03685e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -34,27 +34,23 @@ # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - ), + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - #Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec( - module=FalconTransformerLayer, - submodules=falcon_submodules + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. 
+ falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e97905b9c180..a70ef40ae262 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -31,6 +31,7 @@ MegatronPretrainingSampler, ) from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import build_train_valid_test_datasets +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model @@ -58,7 +59,6 @@ ) from nemo.collections.nlp.parts import utils_funcs from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo from nemo.core.neural_types import ChannelType, NeuralType @@ -106,10 +106,7 @@ def get_specs(spec_name): - name_spec_dict = { - "":get_gpt_layer_with_transformer_engine_spec(), - "megatron_falcon_gpt":get_falcon_layer_spec() - } + name_spec_dict = {"": get_gpt_layer_with_transformer_engine_spec(), "megatron_falcon_gpt": get_falcon_layer_spec()} if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") return name_spec_dict[spec_name] diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index d36c9c7dbf09..eb86c8324dcd 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -18,10 +18,13 @@ from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TELayerNormColumnParallelLinear, +) from megatron.core.transformer.mlp import MLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor -from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear, TEColumnParallelLinear from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -71,12 +74,14 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # TELayerNormColumnParallelLinear fused LN and linear, both will be returned. # In nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py TEColumnParallelLinear is used for linear_qkv, # which only returns linear. 
- if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): + if isinstance(self.linear_qkv, TELayerNormColumnParallelLinear): mixed_qkv, layernorm_output = linear_qkv_output elif isinstance(self.linear_qkv, TEColumnParallelLinear): # only mixed_qkv mixed_qkv = linear_qkv_output else: - raise ValueError(f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. ") + raise ValueError( + f"Unrecognized module type '{type(self.linear_qkv)}' when getting query, key, value tensors for mcore mixins. " + ) # LoRA logic if self.is_adapter_available(): From aea1e81b70822f6e0b097579599228178669f71a Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sat, 9 Dec 2023 11:15:04 -0800 Subject: [PATCH 52/69] fixes for code scanning --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 3 +-- .../models/language_modeling/megatron/falcon/falcon_spec.py | 4 ---- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 5f3c46961c85..13a460e1b285 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -18,8 +18,6 @@ import torch from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig @@ -101,6 +99,7 @@ def forward( # Residual connection. residual = hidden_states + mlp_ln_output = None if self.new_decoder_architecture: mlp_ln_output = self.pre_mlp_layernorm(hidden_states) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index ea265b03685e..28c96bb12af3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -13,17 +13,13 @@ # limitations under the License. 
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.fusions.fused_layer_norm import FusedLayerNorm -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.custom_layers.transformer_engine import ( TEColumnParallelLinear, TEDotProductAttention, - TELayerNormColumnParallelLinear, TENorm, TERowParallelLinear, ) -from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a70ef40ae262..775754fb5f33 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -80,7 +80,6 @@ from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module - from megatron.core.transformer.spec_utils import import_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal From b0966c1394935a318dc38aef986ea9913313e729 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Mon, 11 Dec 2023 21:59:03 -0800 Subject: [PATCH 53/69] remove unused imports --- .../megatron/falcon/falcon_decoder_layer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 13a460e1b285..cbd3430adba8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass -from typing import Union - -import torch from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.spec_utils import build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor From 0c9a2e369efb45bf77ba82fb8353139f4c96eefb Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 12 Dec 2023 14:53:28 -0800 Subject: [PATCH 54/69] unit test for falcon model --- tests/collections/nlp/test_falcon_model.py | 277 +++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_model.py diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py new file mode 100644 index 000000000000..00317bae3fc4 --- /dev/null +++ b/tests/collections/nlp/test_falcon_model.py @@ -0,0 +1,277 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pytest +import torch +from omegaconf import DictConfig +from pytorch_lightning import Trainer + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +@pytest.fixture() +def model_cfg(test_data_dir): + + model_cfg = { + 'mcore_gpt': True, + 'micro_batch_size': 1, + 'global_batch_size': 1, + 'rampup_batch_size': None, + 'tensor_model_parallel_size': 1, + 'pipeline_model_parallel_size': 1, + 'virtual_pipeline_model_parallel_size': None, + 'encoder_seq_length': 512, + 'max_position_embeddings': 512, + 'num_layers': 1, + 'hidden_size': 128, + 'ffn_hidden_size': 512, + 'num_attention_heads': 2, + 'num_query_groups': 1, + 'init_method_std': 0.02, + 'use_scaled_init_method': True, + 'hidden_dropout': 0.0, + 'attention_dropout': 0.0, + 'ffn_dropout': 0, + 'kv_channels': None, + 'apply_query_key_layer_scaling': False, + 'normalization': 'layernorm', + 'layernorm_epsilon': 1e-05, + 'do_layer_norm_weight_decay': False, + 'make_vocab_size_divisible_by': 128, + 'pre_process': True, + 'post_process': True, + 'persist_layer_norm': True, + 'bias': False, + 'activation': 'gelu', + 'headscale': False, + 'transformer_block_type': 'pre_ln', + 'openai_gelu': False, + 'normalize_attention_scores': True, + 'position_embedding_type': 'rope', + 'rotary_percentage': 1.0, + 'attention_type': 'multihead', + 'share_embeddings_and_output_weights': False, + 'overlap_p2p_comm': False, + 'batch_p2p_comm': True, + 'seq_len_interpolation_factor': None, + 'tokenizer': { + 'library': 'huggingface', + 'type': 'tiiuae/falcon-40b', + 'use_fast': True + }, + 'native_amp_init_scale': 4294967296, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'fp32_residual_connection': False, + 'fp16_lm_cross_entropy': False, + 'megatron_amp_O2': False, + 'grad_allreduce_chunk_size_mb': 125, + 'grad_div_ar_fusion': True, + 'gradient_accumulation_fusion': False, + 'bias_activation_fusion': False, + 'bias_dropout_add_fusion': False, + 'masked_softmax_fusion': True, + 'get_attention_mask_from_fusion': True, + 'seed': 1234, + 'resume_from_checkpoint': None, + 'use_cpu_initialization': False, + 'onnx_safe': False, + 'apex_transformer_log_level': 30, + 'gradient_as_bucket_view': True, + 'sync_batch_comm': False, + 'activations_checkpoint_granularity': None, + 'activations_checkpoint_method': None, + 'activations_checkpoint_num_layers': None, + 'num_micro_batches_with_partial_activation_checkpoints': None, + 'activations_checkpoint_layers_per_pipeline': None, + 'sequence_parallel': False, + 'transformer_engine': True, + 'fp8': False, + 'fp8_e4m3': False, + 'fp8_hybrid': False, + 'fp8_margin': 0, + 'fp8_interval': 1, + 'fp8_amax_history_len': 1, + 'fp8_amax_compute_algo': 'most_recent', + 'reduce_amax': True, + 'use_emha': False, + 'ub_tp_comm_overlap': False, + 'ub_tp_comm_overlap_cfg': None, + 'use_flash_attention': False, + 'nsys_profile': { + 'enabled': False, + 'start_step': 10, + 'end_step': 10, + 'ranks': [0], + 'gen_shape': False}, + 'optim': { + 'name': 'distributed_fused_adam', + 'lr': '2e-4', + 'weight_decay': 0.01, + 'betas': [0.9, 0.98], + 'sched': { + 'name': 'CosineAnnealing', + 'warmup_steps': 500, 
+ 'constant_steps': 50000, + 'min_lr': '2e-5'} + }, + 'gc_interval': 0, + 'precision': 'bf16', + 'new_decoder_architecture': False, + 'parallel_attention': True, + 'name': 'megatron_falcon_gpt', + 'target': 'nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel', + } + return model_cfg + + +@pytest.fixture() +def trainer_cfg(): + + trainer_cfg = { + 'devices': 1, + 'num_nodes': 1, + 'accelerator': 'gpu', + 'precision': 'bf16', + 'logger': False, + 'enable_checkpointing': False, + 'use_distributed_sampler': False, + 'max_epochs': 1000, + 'max_steps': 100000, + 'log_every_n_steps': 10, + 'val_check_interval': 100, + 'limit_val_batches': 50, + 'limit_test_batches': 500, + 'accumulate_grad_batches': 1, + 'gradient_clip_val': 1.0, + } + + return trainer_cfg + + +@pytest.fixture() +def precision(): + return 'bf16' + + +@pytest.fixture() +def falcon_gpt_model(model_cfg, trainer_cfg, precision): + model_cfg['precision'] = precision + trainer_cfg['precision'] = precision + + strategy = NLPDDPStrategy() + + trainer = Trainer(strategy=strategy, **trainer_cfg) + + cfg = DictConfig(model_cfg) + + model = MegatronGPTModel(cfg=cfg, trainer=trainer) + + return model + + +@pytest.fixture() +def test_text(): + test_text = [ + "hello, world", + "four score and seven years ago", + "Your time is limited", + "If you set goals rediculously high", + ] + return test_text + + +@pytest.mark.run_only_on('GPU') +class TestFalconGPTModel: + @pytest.mark.unit + def test_constructor(self, falcon_gpt_model): + assert isinstance(falcon_gpt_model, MegatronGPTModel) + + num_weights = falcon_gpt_model.num_weights + assert num_weights == 16827136 + + @pytest.mark.unit + def test_tokenizer(self, falcon_gpt_model, test_text): + + assert isinstance(falcon_gpt_model.tokenizer, AutoTokenizer) + assert falcon_gpt_model.tokenizer.name == 'PreTrainedTokenizerFast' + assert falcon_gpt_model.tokenizer.vocab_size == 65024 + + ids = [falcon_gpt_model.tokenizer.text_to_ids(text) for text in test_text] + + true_ids = [ + [30835, 23, 1079], + [18584, 5179, 273, 5144, 909, 2323], + [4560, 601, 304, 3991], + [1424, 299, 889, 4258, 2400, 276, 20201, 986], + ] + assert sum([id_list == true_id_list for id_list, true_id_list in zip(ids, true_ids)]) == 4 + + @pytest.mark.parametrize( + "precision", + [ + 32, + 16, + pytest.param( + "bf16", + marks=pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='bfloat16 is not supported on this device', + ), + ), + ], + ) + @pytest.mark.unit + def test_forward(self, falcon_gpt_model, test_text): + + dtype = falcon_gpt_model.torch_dtype + + falcon_gpt_model.eval() + + ids = [falcon_gpt_model.tokenizer.text_to_ids(text) for text in test_text] + + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, falcon_gpt_model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + output_tensors = [] + with torch.no_grad(): + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + assert tokens.shape == pos_ids.shape + assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] + with torch.autocast('cuda', dtype=dtype): + output_tensor = falcon_gpt_model.forward( + tokens=tokens.cuda(), + text_position_ids=pos_ids.cuda(), + attention_mask=attn_mask.cuda(), + labels=None, + ) + # output is [b s h] + assert output_tensor.shape[0] == 1 + assert 
output_tensor.shape[1] == tokens.shape[1] + assert output_tensor.shape[2] == falcon_gpt_model.padded_vocab_size + assert output_tensor.dtype == dtype + output_tensors.append(output_tensor) From 8e8ba6624c42d8bb718bbb204a07b1d5486d7e83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:03:55 +0000 Subject: [PATCH 55/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/collections/nlp/test_falcon_model.py | 29 ++++++---------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index 00317bae3fc4..a36b64e82271 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -74,11 +74,7 @@ def model_cfg(test_data_dir): 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'seq_len_interpolation_factor': None, - 'tokenizer': { - 'library': 'huggingface', - 'type': 'tiiuae/falcon-40b', - 'use_fast': True - }, + 'tokenizer': {'library': 'huggingface', 'type': 'tiiuae/falcon-40b', 'use_fast': True}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, @@ -118,23 +114,14 @@ def model_cfg(test_data_dir): 'ub_tp_comm_overlap': False, 'ub_tp_comm_overlap_cfg': None, 'use_flash_attention': False, - 'nsys_profile': { - 'enabled': False, - 'start_step': 10, - 'end_step': 10, - 'ranks': [0], - 'gen_shape': False}, + 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': { - 'name': 'distributed_fused_adam', - 'lr': '2e-4', - 'weight_decay': 0.01, - 'betas': [0.9, 0.98], - 'sched': { - 'name': 'CosineAnnealing', - 'warmup_steps': 500, - 'constant_steps': 50000, - 'min_lr': '2e-5'} - }, + 'name': 'distributed_fused_adam', + 'lr': '2e-4', + 'weight_decay': 0.01, + 'betas': [0.9, 0.98], + 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': '2e-5'}, + }, 'gc_interval': 0, 'precision': 'bf16', 'new_decoder_architecture': False, From b1e63982ebd1f3694f5c73ed3b03f3c21124f33a Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 12 Dec 2023 20:42:17 -0800 Subject: [PATCH 56/69] add falcon transformer layer unit test --- .../nlp/test_falcon_transformer_layer.py | 118 ++++++++++++++++++ tests/utils/test_parallel_utils.py | 31 +++++ 2 files changed, 149 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py create mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py new file mode 100644 index 000000000000..1562c7c3ac89 --- /dev/null +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer + +from tests.utils.test_parallel_utils import Utils + +class TestParallelFalconTransformerLayer: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + self.parallel_falcon_transformer_layer = FalconTransformerLayer(transformer_config, + get_falcon_layer_spec().submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer + assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) + assert parallel_falcon_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_falcon_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('tp_pp', [(1, 1)]) + def test_sharded_state_dict(self, tp_pp): + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*tp_pp) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, + get_falcon_layer_spec().submodules) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + ans = get_tensor_shapes_for_tp(transformer_config, tp_size) + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. 
Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = {k: (transformer_config.num_layers, *v) + for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items()} + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == f'0.{sh_ten.key}' + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + ffn_hs = transformer_config.ffn_hidden_size + return { + '0.input_layernorm.weight': (hs,), + '0.input_layernorm.bias': (hs,), + '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), + '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), + '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), + '0.mlp.linear_fc2.bias': (hs,), + '0.self_attention.linear_proj.weight': (hs, hs // tp_size), + '0.self_attention.linear_proj.bias': (hs,), + '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + '0.post_self_attn_layernorm.weight': (hs,), + '0.post_self_attn_layernorm.bias': (hs,) + } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py new file mode 100644 index 000000000000..ead8ec8b744e --- /dev/null +++ b/tests/utils/test_parallel_utils.py @@ -0,0 +1,31 @@ +import os +import torch +import megatron.core.parallel_state as ps + +class Utils: + + world_size = 1 #one gpu for unit test + os.environ['LOCAL_RANK']='0' + rank = int(os.environ['LOCAL_RANK']) + + @staticmethod + def initialize_distributed(): + print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + ps.destroy_model_parallel() + if not torch.distributed.is_initialized(): + Utils.initialize_distributed() + ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) From 2759755187783d081a6d2b8661a14d8d18240214 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 04:49:26 +0000 Subject: [PATCH 57/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/test_falcon_transformer_layer.py | 32 +++++++++++-------- tests/utils/test_parallel_utils.py | 28 ++++++++++++---- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 1562c7c3ac89..80765edaa454 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ 
b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -16,24 +16,26 @@ import pytest import torch - from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.transformer_config import TransformerConfig -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from tests.utils.test_parallel_utils import Utils -class TestParallelFalconTransformerLayer: +class TestParallelFalconTransformerLayer: def setup_method(self, method): - Utils.initialize_model_parallel(1,1) + Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) - self.parallel_falcon_transformer_layer = FalconTransformerLayer(transformer_config, - get_falcon_layer_spec().submodules) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_falcon_transformer_layer = FalconTransformerLayer( + transformer_config, get_falcon_layer_spec().submodules + ) def teardown_method(self, method): Utils.destroy_model_parallel() @@ -68,9 +70,10 @@ def test_sharded_state_dict(self, tp_pp): Utils.destroy_model_parallel() Utils.initialize_model_parallel(*tp_pp) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, - get_falcon_layer_spec().submodules) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) sharded_state_dict = parallel_transformer_layer.sharded_state_dict() @@ -87,8 +90,9 @@ def test_sharded_state_dict(self, tp_pp): # Test all global shapes. 
Prepend num layers in front of expected shapes tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = {k: (transformer_config.num_layers, *v) - for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items()} + expected_global_shapes = { + k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() + } assert tensor_global_shapes == expected_global_shapes # Test ShardedTensor keys @@ -114,5 +118,5 @@ def get_tensor_shapes_for_tp(transformer_config, tp_size): '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,) + '0.post_self_attn_layernorm.bias': (hs,), } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index ead8ec8b744e..0595cf43b599 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -1,11 +1,13 @@ import os -import torch + import megatron.core.parallel_state as ps +import torch + class Utils: - world_size = 1 #one gpu for unit test - os.environ['LOCAL_RANK']='0' + world_size = 1 # one gpu for unit test + os.environ['LOCAL_RANK'] = '0' rank = int(os.environ['LOCAL_RANK']) @staticmethod @@ -16,16 +18,28 @@ def initialize_distributed(): master_ip = os.getenv('MASTER_ADDR', 'localhost') master_port = os.getenv('MASTER_PORT', '6000') init_method += master_ip + ':' + master_port - torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) - + torch.distributed.init_process_group( + backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method + ) + @staticmethod def destroy_model_parallel(): ps.destroy_model_parallel() torch.distributed.barrier() @staticmethod - def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ): ps.destroy_model_parallel() if not torch.distributed.is_initialized(): Utils.initialize_distributed() - ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) From 5ad525c458025741ff3150aebf9172516922abef Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 10:21:23 -0800 Subject: [PATCH 58/69] fixes for code scan --- tests/collections/nlp/test_falcon_model.py | 2 -- tests/collections/nlp/test_falcon_transformer_layer.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index a36b64e82271..860434ac772b 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import pytest import torch from omegaconf import DictConfig diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 80765edaa454..2613e5535a5b 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest import torch from megatron.core import parallel_state @@ -85,7 +83,6 @@ def test_sharded_state_dict(self, tp_pp): # Test all local shapes tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} tp_size = parallel_state.get_tensor_model_parallel_world_size() - ans = get_tensor_shapes_for_tp(transformer_config, tp_size) assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) # Test all global shapes. Prepend num layers in front of expected shapes From 9c4960f0bb64c9b52fd96f99316e88d37844b5ca Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 13:35:38 -0800 Subject: [PATCH 59/69] remove mcore dependent tests --- .../nlp/test_falcon_transformer_layer.py | 119 ------------------ tests/utils/test_parallel_utils.py | 45 ------- 2 files changed, 164 deletions(-) delete mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py delete mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py deleted file mode 100644 index 2613e5535a5b..000000000000 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig - -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer -from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from tests.utils.test_parallel_utils import Utils - - -class TestParallelFalconTransformerLayer: - def setup_method(self, method): - Utils.initialize_model_parallel(1, 1) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True - ) - self.parallel_falcon_transformer_layer = FalconTransformerLayer( - transformer_config, get_falcon_layer_spec().submodules - ) - - def teardown_method(self, method): - Utils.destroy_model_parallel() - - def test_constructor(self): - parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer - assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) - assert parallel_falcon_transformer_layer.layer_number == 1 - - num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) - assert num_weights == 1884 - - def test_gpu_forward(self): - parallel_transformer_layer = self.parallel_falcon_transformer_layer - config: TransformerConfig = parallel_transformer_layer.config - sequence_length = 32 - micro_batch_size = 2 - parallel_transformer_layer.cuda() - - # [sequence length, batch size, hidden size] - hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) - hidden_states = hidden_states.cuda() - - attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() - hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) - assert hidden_states.shape[0] == sequence_length - assert hidden_states.shape[1] == micro_batch_size - assert hidden_states.shape[2] == config.hidden_size - - @pytest.mark.parametrize('tp_pp', [(1, 1)]) - def test_sharded_state_dict(self, tp_pp): - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(*tp_pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True - ) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) - - sharded_state_dict = parallel_transformer_layer.sharded_state_dict() - - extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} - sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} - assert all(isinstance(t, ShardedObject) for t in extra_states.values()) - assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) - - # Test all local shapes - tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} - tp_size = parallel_state.get_tensor_model_parallel_world_size() - assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) - - # Test all global shapes. 
Prepend num layers in front of expected shapes - tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = { - k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() - } - assert tensor_global_shapes == expected_global_shapes - - # Test ShardedTensor keys - for state_dict_key, sh_ten in sharded_tensors.items(): - assert state_dict_key == f'0.{sh_ten.key}' - - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(1, 1) - - -def get_tensor_shapes_for_tp(transformer_config, tp_size): - hs = transformer_config.hidden_size - ffn_hs = transformer_config.ffn_hidden_size - return { - '0.input_layernorm.weight': (hs,), - '0.input_layernorm.bias': (hs,), - '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), - '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), - '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), - '0.mlp.linear_fc2.bias': (hs,), - '0.self_attention.linear_proj.weight': (hs, hs // tp_size), - '0.self_attention.linear_proj.bias': (hs,), - '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), - '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), - '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,), - } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py deleted file mode 100644 index 0595cf43b599..000000000000 --- a/tests/utils/test_parallel_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -import os - -import megatron.core.parallel_state as ps -import torch - - -class Utils: - - world_size = 1 # one gpu for unit test - os.environ['LOCAL_RANK'] = '0' - rank = int(os.environ['LOCAL_RANK']) - - @staticmethod - def initialize_distributed(): - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) - init_method = 'tcp://' - master_ip = os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method - ) - - @staticmethod - def destroy_model_parallel(): - ps.destroy_model_parallel() - torch.distributed.barrier() - - @staticmethod - def initialize_model_parallel( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - pipeline_model_parallel_split_rank=None, - ): - ps.destroy_model_parallel() - if not torch.distributed.is_initialized(): - Utils.initialize_distributed() - ps.initialize_model_parallel( - tensor_model_parallel_size, - pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size, - pipeline_model_parallel_split_rank, - ) From fb048066e9688b945c2b6ecbd715687f5b79c012 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 14:25:40 -0800 Subject: [PATCH 60/69] Revert "remove mcore dependent tests" This reverts commit 9c4960f0bb64c9b52fd96f99316e88d37844b5ca. 
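For context on the revert: the helper being restored in tests/utils/test_parallel_utils.py wraps the usual single-process megatron-core setup that the FalconTransformerLayer tests depend on. The sketch below condenses that pattern; it assumes megatron-core is installed and one GPU is visible, and the function names are illustrative rather than the restored file's API.

```python
# Minimal single-GPU setup/teardown pattern used by megatron-core unit tests
# (illustrative sketch; the real helper lives in tests/utils/test_parallel_utils.py).
import os

import torch
import megatron.core.parallel_state as ps


def init_single_gpu_model_parallel(tp: int = 1, pp: int = 1) -> None:
    """Initialize torch.distributed with one rank, then megatron model parallel."""
    if not torch.distributed.is_initialized():
        rank = int(os.getenv("LOCAL_RANK", "0"))
        torch.cuda.set_device(rank % torch.cuda.device_count())
        init_method = f"tcp://{os.getenv('MASTER_ADDR', 'localhost')}:{os.getenv('MASTER_PORT', '6000')}"
        torch.distributed.init_process_group(
            backend="nccl", world_size=1, rank=rank, init_method=init_method
        )
    ps.destroy_model_parallel()
    ps.initialize_model_parallel(tp, pp)


def teardown_model_parallel() -> None:
    ps.destroy_model_parallel()
    torch.distributed.barrier()
```

Tests then call init_single_gpu_model_parallel() in setup_method and teardown_model_parallel() in teardown_method, which is essentially what the restored Utils class does through initialize_model_parallel and destroy_model_parallel.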
--- .../nlp/test_falcon_transformer_layer.py | 119 ++++++++++++++++++ tests/utils/test_parallel_utils.py | 45 +++++++ 2 files changed, 164 insertions(+) create mode 100644 tests/collections/nlp/test_falcon_transformer_layer.py create mode 100644 tests/utils/test_parallel_utils.py diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py new file mode 100644 index 000000000000..2613e5535a5b --- /dev/null +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer +from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec +from tests.utils.test_parallel_utils import Utils + + +class TestParallelFalconTransformerLayer: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_falcon_transformer_layer = FalconTransformerLayer( + transformer_config, get_falcon_layer_spec().submodules + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer + assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) + assert parallel_falcon_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_falcon_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + hidden_states, context = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('tp_pp', [(1, 1)]) + def test_sharded_state_dict(self, tp_pp): + Utils.destroy_model_parallel() + 
Utils.initialize_model_parallel(*tp_pp) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = { + k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() + } + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == f'0.{sh_ten.key}' + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + ffn_hs = transformer_config.ffn_hidden_size + return { + '0.input_layernorm.weight': (hs,), + '0.input_layernorm.bias': (hs,), + '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), + '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), + '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), + '0.mlp.linear_fc2.bias': (hs,), + '0.self_attention.linear_proj.weight': (hs, hs // tp_size), + '0.self_attention.linear_proj.bias': (hs,), + '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + '0.post_self_attn_layernorm.weight': (hs,), + '0.post_self_attn_layernorm.bias': (hs,), + } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py new file mode 100644 index 000000000000..0595cf43b599 --- /dev/null +++ b/tests/utils/test_parallel_utils.py @@ -0,0 +1,45 @@ +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + + world_size = 1 # one gpu for unit test + os.environ['LOCAL_RANK'] = '0' + rank = int(os.environ['LOCAL_RANK']) + + @staticmethod + def initialize_distributed(): + print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method + ) + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, 
+ ): + ps.destroy_model_parallel() + if not torch.distributed.is_initialized(): + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) From e54fdad6643e2acff0f87a4e20db7460c0be44d0 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Wed, 13 Dec 2023 18:07:40 -0800 Subject: [PATCH 61/69] add import guards --- .../megatron/falcon/falcon_decoder_layer.py | 25 +++++++++++---- .../megatron/falcon/falcon_spec.py | 32 ++++++++++++------- .../nlp/test_falcon_transformer_layer.py | 16 +++++++--- tests/utils/test_parallel_utils.py | 10 ++++-- 4 files changed, 58 insertions(+), 25 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index cbd3430adba8..a75a6f5e4645 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.spec_utils import build_module -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules -from megatron.core.utils import make_viewless_tensor +try: + from megatron.core import parallel_state + from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.spec_utils import build_module + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + from megatron.core.utils import make_viewless_tensor + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False """ We use the following notation throughout this file: h: hidden size @@ -51,6 +58,10 @@ def __init__( layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, ): + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + ) super().__init__(config=config, submodules=submodules, layer_number=layer_number) if hasattr(self.config, 'new_decoder_architecture'): diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 28c96bb12af3..88a4b5a7bb7c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -12,19 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.custom_layers.transformer_engine import ( - TEColumnParallelLinear, - TEDotProductAttention, - TENorm, - TERowParallelLinear, -) -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.spec_utils import ModuleSpec +try: + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + + from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False -from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 2613e5535a5b..d17597f7fd08 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -14,10 +14,18 @@ import pytest import torch -from megatron.core import parallel_state -from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig + +try: + from megatron.core import parallel_state + from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + from megatron.core.transformer.transformer_config import TransformerConfig + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index 0595cf43b599..c0af612280f5 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -1,8 +1,14 @@ import os - -import megatron.core.parallel_state as ps import torch +try: + import megatron.core.parallel_state as ps + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False class Utils: From 7cd8cfb12a4f1861ea26de30ecd693c8df82ec76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 02:09:07 +0000 Subject: [PATCH 62/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/utils/test_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py index c0af612280f5..eadc467fbce1 100644 --- a/tests/utils/test_parallel_utils.py +++ b/tests/utils/test_parallel_utils.py @@ -10,6 +10,7 @@ HAVE_MEGATRON_CORE = False + class Utils: world_size = 1 # one gpu for unit test From beada8c6014489cc598f96c55d743cd3e0b3c325 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 00:29:54 -0800 Subject: [PATCH 63/69] add import guards cont --- .../megatron/falcon/falcon_decoder_layer.py | 3 +++ .../language_modeling/megatron/falcon/falcon_spec.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index a75a6f5e4645..73b4a70797ab 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -27,6 +27,9 @@ HAVE_MEGATRON_CORE = False + class TransformerLayer: + pass + """ We use the following notation throughout this file: h: hidden size n: number of attention heads diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 88a4b5a7bb7c..5022b49b07e2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -33,10 +33,17 @@ HAVE_MEGATRON_CORE = False + from typing import Any + ModuleSpec = Any + from .falcon_decoder_layer import FalconTransformerLayer # Use this spec for an implementation using modules in TE def get_falcon_layer_spec() -> ModuleSpec: + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
+ ) falcon_submodules = TransformerLayerSubmodules( input_layernorm=TENorm, self_attention=ModuleSpec( From 5d76cf3a3cb51a25c995f3140ee4fdb8cb49fbdc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 08:31:25 +0000 Subject: [PATCH 64/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 1 + .../nlp/models/language_modeling/megatron/falcon/falcon_spec.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 73b4a70797ab..92db42a88fc1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -30,6 +30,7 @@ class TransformerLayer: pass + """ We use the following notation throughout this file: h: hidden size n: number of attention heads diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 5022b49b07e2..b6ec930c2964 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -34,6 +34,7 @@ HAVE_MEGATRON_CORE = False from typing import Any + ModuleSpec = Any from .falcon_decoder_layer import FalconTransformerLayer From 27b7694361d971306b99a8fd88950e2756458a26 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 17:36:10 -0800 Subject: [PATCH 65/69] fixes for ci import tests and unit tests --- .../megatron/falcon/falcon_decoder_layer.py | 12 ++-- .../megatron/falcon/falcon_spec.py | 6 +- tests/collections/nlp/test_falcon_model.py | 4 +- .../nlp/test_falcon_transformer_layer.py | 68 ++----------------- tests/utils/test_parallel_utils.py | 52 -------------- 5 files changed, 17 insertions(+), 125 deletions(-) delete mode 100644 tests/utils/test_parallel_utils.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 92db42a88fc1..4c5e88b59680 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults + try: from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor @@ -27,9 +29,10 @@ HAVE_MEGATRON_CORE = False - class TransformerLayer: - pass - + TransformerLayer = ApexGuardDefaults + TransformerConfig = ApexGuardDefaults + TransformerLayerSubmodules = ApexGuardDefaults + AttnMaskType = ApexGuardDefaults() """ We use the following notation throughout this file: h: hidden size @@ -46,7 +49,6 @@ class TransformerLayer: hyperparameters: transformer hyperparameters """ - class FalconTransformerLayer(TransformerLayer): """A single transformer layer. 
@@ -57,7 +59,7 @@ class FalconTransformerLayer(TransformerLayer): def __init__( self, - config: TransformerConfig, # should come from FalconTransformerConfig class + config: TransformerConfig, submodules: TransformerLayerSubmodules, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index b6ec930c2964..924e5f4321e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults + try: from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -33,9 +35,7 @@ HAVE_MEGATRON_CORE = False - from typing import Any - - ModuleSpec = Any + ModuleSpec = ApexGuardDefaults from .falcon_decoder_layer import FalconTransformerLayer diff --git a/tests/collections/nlp/test_falcon_model.py b/tests/collections/nlp/test_falcon_model.py index 860434ac772b..23430ad36300 100644 --- a/tests/collections/nlp/test_falcon_model.py +++ b/tests/collections/nlp/test_falcon_model.py @@ -32,8 +32,8 @@ def model_cfg(test_data_dir): model_cfg = { 'mcore_gpt': True, - 'micro_batch_size': 1, - 'global_batch_size': 1, + 'micro_batch_size': 4, + 'global_batch_size': 8, 'rampup_batch_size': None, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index d17597f7fd08..609a56e14596 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -16,8 +16,7 @@ import torch try: - from megatron.core import parallel_state - from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.transformer_config import TransformerConfig @@ -29,12 +28,11 @@ from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec -from tests.utils.test_parallel_utils import Utils - +@pytest.mark.run_only_on('GPU') class TestParallelFalconTransformerLayer: + def setup_method(self, method): - Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) transformer_config = TransformerConfig( num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True @@ -43,9 +41,7 @@ def setup_method(self, method): transformer_config, get_falcon_layer_spec().submodules ) - def teardown_method(self, method): - Utils.destroy_model_parallel() - + @pytest.mark.unit def test_constructor(self): parallel_falcon_transformer_layer = self.parallel_falcon_transformer_layer assert isinstance(parallel_falcon_transformer_layer, FalconTransformerLayer) @@ -54,6 +50,7 @@ def test_constructor(self): num_weights = sum([p.numel() for p in parallel_falcon_transformer_layer.parameters()]) assert num_weights == 1884 + @pytest.mark.unit def test_gpu_forward(self): 
parallel_transformer_layer = self.parallel_falcon_transformer_layer config: TransformerConfig = parallel_transformer_layer.config @@ -70,58 +67,3 @@ def test_gpu_forward(self): assert hidden_states.shape[0] == sequence_length assert hidden_states.shape[1] == micro_batch_size assert hidden_states.shape[2] == config.hidden_size - - @pytest.mark.parametrize('tp_pp', [(1, 1)]) - def test_sharded_state_dict(self, tp_pp): - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(*tp_pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True - ) - parallel_transformer_layer = FalconTransformerLayer(transformer_config, get_falcon_layer_spec().submodules) - - sharded_state_dict = parallel_transformer_layer.sharded_state_dict() - - extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} - sharded_tensors = {k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state')} - assert all(isinstance(t, ShardedObject) for t in extra_states.values()) - assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) - - # Test all local shapes - tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} - tp_size = parallel_state.get_tensor_model_parallel_world_size() - assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) - - # Test all global shapes. Prepend num layers in front of expected shapes - tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} - expected_global_shapes = { - k: (transformer_config.num_layers, *v) for k, v in get_tensor_shapes_for_tp(transformer_config, 1).items() - } - assert tensor_global_shapes == expected_global_shapes - - # Test ShardedTensor keys - for state_dict_key, sh_ten in sharded_tensors.items(): - assert state_dict_key == f'0.{sh_ten.key}' - - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(1, 1) - - -def get_tensor_shapes_for_tp(transformer_config, tp_size): - hs = transformer_config.hidden_size - ffn_hs = transformer_config.ffn_hidden_size - return { - '0.input_layernorm.weight': (hs,), - '0.input_layernorm.bias': (hs,), - '0.mlp.linear_fc1.weight': (ffn_hs // tp_size, hs), - '0.mlp.linear_fc1.bias': (ffn_hs // tp_size,), - '0.mlp.linear_fc2.weight': (hs, ffn_hs // tp_size), - '0.mlp.linear_fc2.bias': (hs,), - '0.self_attention.linear_proj.weight': (hs, hs // tp_size), - '0.self_attention.linear_proj.bias': (hs,), - '0.self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), - '0.self_attention.linear_qkv.bias': (hs * 3 // tp_size,), - '0.post_self_attn_layernorm.weight': (hs,), - '0.post_self_attn_layernorm.bias': (hs,), - } diff --git a/tests/utils/test_parallel_utils.py b/tests/utils/test_parallel_utils.py deleted file mode 100644 index eadc467fbce1..000000000000 --- a/tests/utils/test_parallel_utils.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import torch - -try: - import megatron.core.parallel_state as ps - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -class Utils: - - world_size = 1 # one gpu for unit test - os.environ['LOCAL_RANK'] = '0' - rank = int(os.environ['LOCAL_RANK']) - - @staticmethod - def initialize_distributed(): - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) - init_method = 'tcp://' - master_ip = 
os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method - ) - - @staticmethod - def destroy_model_parallel(): - ps.destroy_model_parallel() - torch.distributed.barrier() - - @staticmethod - def initialize_model_parallel( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - pipeline_model_parallel_split_rank=None, - ): - ps.destroy_model_parallel() - if not torch.distributed.is_initialized(): - Utils.initialize_distributed() - ps.initialize_model_parallel( - tensor_model_parallel_size, - pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size, - pipeline_model_parallel_split_rank, - ) From e7476e8e7d78237bfec68fd9ee077f215766eb3a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 01:38:38 +0000 Subject: [PATCH 66/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/falcon/falcon_decoder_layer.py | 1 + tests/collections/nlp/test_falcon_transformer_layer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 4c5e88b59680..67c732c6aee2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -49,6 +49,7 @@ hyperparameters: transformer hyperparameters """ + class FalconTransformerLayer(TransformerLayer): """A single transformer layer. diff --git a/tests/collections/nlp/test_falcon_transformer_layer.py b/tests/collections/nlp/test_falcon_transformer_layer.py index 609a56e14596..3edb541e8e33 100644 --- a/tests/collections/nlp/test_falcon_transformer_layer.py +++ b/tests/collections/nlp/test_falcon_transformer_layer.py @@ -29,9 +29,9 @@ from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_decoder_layer import FalconTransformerLayer from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec + @pytest.mark.run_only_on('GPU') class TestParallelFalconTransformerLayer: - def setup_method(self, method): model_parallel_cuda_manual_seed(123) transformer_config = TransformerConfig( From 90285550a47ae3cd2f8574f5840227e023f42582 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 22:35:28 -0800 Subject: [PATCH 67/69] fixes for codeql --- .../megatron/falcon/falcon_decoder_layer.py | 55 ++++++++++--------- .../megatron/falcon/falcon_spec.py | 41 +++++++------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 67c732c6aee2..f02f183adea3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -69,35 +69,36 @@ def __init__( raise ImportError( "megatron-core was not found. 
Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) - super().__init__(config=config, submodules=submodules, layer_number=layer_number) - - if hasattr(self.config, 'new_decoder_architecture'): - self.new_decoder_architecture = self.config.new_decoder_architecture - else: - self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): - self.parallel_attention = self.config.parallel_attention else: - self.parallel_attention = None + super().__init__(config=config, submodules=submodules, layer_number=layer_number) - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - if self.new_decoder_architecture: - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - else: - self.pre_mlp_layernorm = None + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None + + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.pre_mlp_layernorm = None def forward( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 924e5f4321e6..ab5622547782 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -45,24 +45,25 @@ def get_falcon_layer_spec() -> ModuleSpec: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) - falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, + else: + falcon_submodules = TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. + falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) From 0531cff47ae948066b7ffb687b3683c205116f66 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 06:38:41 +0000 Subject: [PATCH 68/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron/falcon/falcon_spec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index ab5622547782..6efe6d4e23c7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -60,7 +60,8 @@ def get_falcon_layer_spec() -> ModuleSpec: self_attn_bda=get_bias_dropout_add, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + module=MLP, + submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), ), mlp_bda=get_bias_dropout_add, ) From 5f866da2f93dce1c97ae7a3682cb9394bed7c49e Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Thu, 14 Dec 2023 22:52:31 -0800 Subject: [PATCH 69/69] Revert "fixes for codeql" This reverts commit 90285550a47ae3cd2f8574f5840227e023f42582. 
--- .../megatron/falcon/falcon_decoder_layer.py | 55 +++++++++---------- .../megatron/falcon/falcon_spec.py | 42 +++++++------- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index f02f183adea3..67c732c6aee2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -69,36 +69,35 @@ def __init__( raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) - else: - super().__init__(config=config, submodules=submodules, layer_number=layer_number) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) - if hasattr(self.config, 'new_decoder_architecture'): - self.new_decoder_architecture = self.config.new_decoder_architecture - else: - self.new_decoder_architecture = None - if hasattr(self.config, 'parallel_attention'): - self.parallel_attention = self.config.parallel_attention - else: - self.parallel_attention = None + if hasattr(self.config, 'new_decoder_architecture'): + self.new_decoder_architecture = self.config.new_decoder_architecture + else: + self.new_decoder_architecture = None + if hasattr(self.config, 'parallel_attention'): + self.parallel_attention = self.config.parallel_attention + else: + self.parallel_attention = None - if self.new_decoder_architecture or self.parallel_attention: - self.post_self_attn_layernorm = None - else: - self.post_self_attn_layernorm = build_module( - submodules.post_self_attn_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - if self.new_decoder_architecture: - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - else: - self.pre_mlp_layernorm = None + if self.new_decoder_architecture or self.parallel_attention: + self.post_self_attn_layernorm = None + else: + self.post_self_attn_layernorm = build_module( + submodules.post_self_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + if self.new_decoder_architecture: + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.pre_mlp_layernorm = None def forward( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 6efe6d4e23c7..924e5f4321e6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -45,26 +45,24 @@ def get_falcon_layer_spec() -> ModuleSpec: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) - else: - falcon_submodules = TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TEColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - ), + falcon_submodules = TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - ) - # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. - falcon_submodules.post_self_attn_layernorm = TENorm - return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules) + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ) + # Old falcon(prior to 7b/40b/180b) uses post_self_attn_layernorm that is not included in TransformerLayerModules. + falcon_submodules.post_self_attn_layernorm = TENorm + return ModuleSpec(module=FalconTransformerLayer, submodules=falcon_submodules)
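
The net effect of PATCH 61-69 is one recurring import-guard pattern applied to the Falcon modules: wrap every `megatron.core` import in try/except, record the outcome in HAVE_MEGATRON_CORE, alias the missing symbols to a benign placeholder so the module itself still imports, and raise ImportError only when the guarded class or spec is actually constructed. The sketch below is a minimal, self-contained illustration of that pattern, not code from the patches; `_GuardDefaults` and `GuardedFalconLayer` are hypothetical names standing in for NeMo's `ApexGuardDefaults` and `FalconTransformerLayer`.

```
# Minimal sketch of the import-guard pattern used in falcon_decoder_layer.py and
# falcon_spec.py. _GuardDefaults is a hypothetical stand-in for NeMo's
# ApexGuardDefaults; it exists only so the module imports cleanly without megatron-core.


class _GuardDefaults:
    """Placeholder whose instance attribute lookups return None when megatron-core is absent."""

    def __getattr__(self, item):
        return None


try:
    from megatron.core.transformer.transformer_layer import TransformerLayer

    HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

    HAVE_MEGATRON_CORE = False

    # Alias the missing base class so the class definition below still parses and imports.
    TransformerLayer = _GuardDefaults


class GuardedFalconLayer(TransformerLayer):
    """Defers the hard failure from import time to construction time."""

    def __init__(self, *args, **kwargs):
        if not HAVE_MEGATRON_CORE:
            raise ImportError(
                "megatron-core was not found. Please see the NeMo README for "
                "installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(*args, **kwargs)
```

Deferring the failure this way is what lets the CI import checks targeted by PATCH 65 import the Falcon modules in environments without megatron-core, while anyone who actually constructs the layer still gets an actionable error message.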